[llvm-branch-commits] [clang] [lld] [lldb] [llvm] [mlir] clang/HIP: Remove REQUIRES libgcc from a test (PR #112412)

Matt Arsenault via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Oct 16 08:25:44 PDT 2024


Andrzej Warzyński <andrzej.warzynski at arm.com>,Kazu Hirata
 <kazu at google.com>,Petar Avramovic <Petar.Avramovic at amd.com>,Christudasan
 Devadasan <christudasan.devadasan at amd.com>,Christudasan Devadasan
 <christudasan.devadasan at amd.com>,Howard Roark
 <howard.roark at huawei-partners.com>,Christudasan Devadasan
 <christudasan.devadasan at amd.com>,Antonio Frighetto <me at antoniofrighetto.com>,Antonio
 Frighetto <me at antoniofrighetto.com>,Christudasan Devadasan
 <christudasan.devadasan at amd.com>,Hans <hans at hanshq.net>,
Balázs Kéri <balazs.keri at ericsson.com>,Yingwei Zheng
 <dtcxzyw2333 at gmail.com>,Ilya Biryukov <ibiryukov at google.com>,Jonas Paulsson
 <paulson1 at linux.ibm.com>,Daniel Grumberg <dgrumberg at apple.com>,Timm Baeder
 <tbaeder at redhat.com>,Amr Hesham <amr96 at programmer.net>,Amr Hesham
 <amr96 at programmer.net>,Benjamin Maxwell <benjamin.maxwell at arm.com>,Karl-Johan
 Karlsson <karl-johan.karlsson at ericsson.com>,Mehdi Amini <joker.eph at gmail.com>,
 Luke Lau <luke at igalia.com>,Akshat Oke <Akshat.Oke at amd.com>,Sergio
 Afonso <safonsof at amd.com>,Simon Camphausen
 <simon.camphausen at iml.fraunhofer.de>,Youngsuk Kim <youngsuk.kim at hpe.com>,Simon
 Pilgrim <llvm-dev at redking.me.uk>,Simon Pilgrim <llvm-dev at redking.me.uk>,Simon
 Pilgrim <llvm-dev at redking.me.uk>,Simon Pilgrim <llvm-dev at redking.me.uk>,
Stefan Gränitz <stefan.graenitz at gmail.com>,Yingwei Zheng
 <dtcxzyw2333 at gmail.com>,Ramkumar Ramachandra
 <ramkumar.ramachandra at codasip.com>,
Donát Nagy <donat.nagy at ericsson.com>,Karl-Johan Karlsson
 <karl-johan.karlsson at ericsson.com>,Sven van Haastregt
 <sven.vanhaastregt at arm.com>,Lewis Crawford <lcrawford at nvidia.com>,Simon
 Pilgrim <llvm-dev at redking.me.uk>,Sander de Smalen <sander.desmalen at arm.com>,Benjamin
 Kramer <benny.kra at googlemail.com>,Yuta Saito <kateinoigakukun at gmail.com>,Michael
 Buch <michaelbuch12 at gmail.com>,Sirui Mu <msrlancern at gmail.com>,VladiKrapp-Arm
 <vladi.krapp at arm.com>,Nico Weber <thakis at chromium.org>,LLVM GN Syncbot
 <llvmgnsyncbot at gmail.com>,"A. Jiang" <de34 at live.cn>,Simon Pilgrim
 <llvm-dev at redking.me.uk>,"A. Jiang" <de34 at live.cn>,Michael Maitland
 <michaeltmaitland at gmail.com>,Kazu Hirata <kazu at google.com>,Kazu Hirata
 <kazu at google.com>,Kazu Hirata <kazu at google.com>,Kazu Hirata <kazu at google.com>,
 David Green <david.green at arm.com>,Tom Eccles
 <tom.eccles at arm.com>,David Truby <david.truby at arm.com>,David Truby
 <david.truby at arm.com>,Dmitry Vasilyev <dvassiliev at accesssoftek.com>,Boaz
 Brickner <brickner at google.com>,Rahul Joshi <rjoshi at nvidia.com>,Rahul Joshi
 <rjoshi at nvidia.com>,Stefan Pintilie <stefanp at ca.ibm.com>,Brox Chen
 <guochen2 at amd.com>,OverMighty <its.overmighty at gmail.com>,Krystian Stasiowski
 <sdkrystian at gmail.com>,David Spickett <david.spickett at linaro.org>,Jay Foad
 <jay.foad at amd.com>,Jay Foad <jay.foad at amd.com>,Matt Arsenault
 <Matthew.Arsenault at amd.com>
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/112412 at github.com>


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/112412

>From 3860e29e0e743c5f411c3023396d1ea07c28da7d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 16 Oct 2024 06:10:18 +0100
Subject: [PATCH 01/82] [VPlan] Mark VPVectorPointerRecipe as not having
 side effects.

VectorPointer doesn't read from memory or have any side effects. Mark it
accordingly.
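
For context, a recipe that neither reads nor writes memory and has no other
side effects can be dropped when nothing uses it, which is why several
now-dead vector-pointer GEPs disappear from the tests below. A minimal
generic sketch of that gating logic (hypothetical types only, not the actual
VPlan API):

  // Hypothetical illustration of the dead-recipe check this change enables;
  // the real VPRecipeBase API is not reproduced here.
  struct RecipeInfo {
    bool HasUsers;
    bool MayReadFromMemory;
    bool MayWriteToMemory;
    bool MayHaveSideEffects;
  };

  // A recipe with no users may be erased once it is known to have no
  // observable effects.
  bool canEraseDeadRecipe(const RecipeInfo &R) {
    return !R.HasUsers && !R.MayWriteToMemory && !R.MayHaveSideEffects;
  }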
---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp              | 3 +++
 .../Transforms/LoopVectorize/AArch64/induction-costs.ll     | 2 --
 .../RISCV/first-order-recurrence-scalable-vf1.ll            | 1 -
 llvm/test/Transforms/LoopVectorize/X86/cost-model.ll        | 6 +-----
 llvm/test/Transforms/LoopVectorize/dead_instructions.ll     | 3 ---
 llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll    | 6 ------
 6 files changed, 4 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 0f90166528a465..6fe30356e8c912 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -87,6 +87,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPBlendSC:
   case VPReductionEVLSC:
   case VPReductionSC:
+  case VPVectorPointerSC:
   case VPWidenCanonicalIVSC:
   case VPWidenCastSC:
   case VPWidenGEPSC:
@@ -132,6 +133,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
   case VPBlendSC:
   case VPReductionEVLSC:
   case VPReductionSC:
+  case VPVectorPointerSC:
   case VPWidenCanonicalIVSC:
   case VPWidenCastSC:
   case VPWidenGEPSC:
@@ -170,6 +172,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPReductionEVLSC:
   case VPReductionSC:
   case VPScalarIVStepsSC:
+  case VPVectorPointerSC:
   case VPWidenCanonicalIVSC:
   case VPWidenCastSC:
   case VPWidenGEPSC:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index 8080c3a9ba0a7d..36eee8d0c98cea 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -110,7 +110,6 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) {
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 2
 ; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <2 x i32>, ptr [[TMP7]], align 1
 ; CHECK-NEXT:    [[TMP9]] = zext <2 x i32> [[WIDE_LOAD4]] to <2 x i64>
@@ -169,7 +168,6 @@ define i64 @int_and_pointer_iv(ptr %start, i32 %N) {
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP5]] = zext <4 x i32> [[WIDE_LOAD3]] to <4 x i64>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
index 2fd00d67a43e6c..f4dfdacac1b321 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
@@ -18,7 +18,6 @@ define i64 @pr97452_scalable_vf1_for(ptr %src) #0 {
 ; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD1:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
 ; CHECK-NEXT:    [[WIDE_LOAD1]] = load <4 x i64>, ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 8e5bf27acc64f1..73647919aac360 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -858,10 +858,6 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 0, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[TMP5]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 4
 ; CHECK-NEXT:    [[TMP11]] = and <4 x i32> [[VEC_PHI]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP12]] = and <4 x i32> [[VEC_PHI1]], [[TMP2]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -879,7 +875,7 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 {
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[IV]]
 ; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
 ; CHECK-NEXT:    [[L_AND:%.*]] = and i32 [[L]], 3
 ; CHECK-NEXT:    store i32 [[L_AND]], ptr [[DST]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/dead_instructions.ll b/llvm/test/Transforms/LoopVectorize/dead_instructions.ll
index aae9393bbe0dd2..8b5dd4211d8505 100644
--- a/llvm/test/Transforms/LoopVectorize/dead_instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/dead_instructions.ll
@@ -159,9 +159,6 @@ define void @dead_load_and_vector_pointer(ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[WIDE_LOAD2]], <i32 1, i32 1>
 ; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr [[TMP4]], align 4, !alias.scope [[META6]], !noalias [[META9]]
 ; CHECK-NEXT:    store <2 x i32> [[TMP7]], ptr [[TMP5]], align 4, !alias.scope [[META6]], !noalias [[META9]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll b/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll
index 3d2c2e5e9b5761..7a5a8bfb1a9956 100644
--- a/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll
+++ b/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll
@@ -19,12 +19,6 @@ define i64 @foo(ptr %p1, ptr %p2, i64 %start, i64 %end) {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[START2]], [[INDEX]]
-; CHECK-NEXT:    [[IND:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[IND]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[IND]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]

>From 00cd1a06daa7f950cf0954c7f9fafc371c255639 Mon Sep 17 00:00:00 2001
From: Thomas Fransham <tfransham at gmail.com>
Date: Wed, 16 Oct 2024 06:41:33 +0100
Subject: [PATCH 02/82] Update llvm::Registry to work for LLVM shared library
 builds on windows (#109024)

This is part of the effort to support enabling plugins on Windows by
adding better support for building LLVM and Clang as a DLL.

Since Windows doesn't implicitly import and merge exported symbols
across shared libraries the way other platforms do, we need to explicitly
add an extern template declaration for each instantiation of llvm::Registry
to force the registry symbols to be dllimported.

I've added a new visibility macro that doesn't switch between dllimport
and dllexport on Windows, since the existing macro would be in the wrong
mode for the llvm::Registry instantiations declared in Clang. This PR also
depends on the Clang symbol visibility macros that will be added by #108276.
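
As a rough sketch of the pattern described above (MyAction and
MyActionRegistry are made-up names, not real Clang types), a header declares
the instantiation with the import/export macro and exactly one translation
unit inside the library provides the exported definition:

  // Hypothetical header. CLANG_TEMPLATE_ABI expands to __declspec(dllimport)
  // for DLL consumers on Windows and does nothing (or sets default
  // visibility) on other platforms.
  #include "clang/Support/Compiler.h"
  #include "llvm/Support/Registry.h"

  class MyAction;
  using MyActionRegistry = llvm::Registry<MyAction>;

  namespace llvm {
  extern template class CLANG_TEMPLATE_ABI Registry<MyAction>;
  } // namespace llvm

  // One .cpp inside the library then instantiates and exports the registry's
  // static members:
  //   LLVM_INSTANTIATE_REGISTRY(MyActionRegistry)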

---------

Co-authored-by: Saleem Abdulrasool <compnerd at compnerd.org>
---
 clang/include/clang/Basic/ParsedAttrInfo.h    |  5 ++
 .../clang/Frontend/FrontendPluginRegistry.h   |  5 ++
 clang/include/clang/Lex/Preprocessor.h        |  5 ++
 .../CompilationDatabasePluginRegistry.h       |  6 ++
 .../Tooling/ToolExecutorPluginRegistry.h      |  6 ++
 llvm/include/llvm/CodeGen/GCMetadataPrinter.h |  2 +
 llvm/include/llvm/IR/GCStrategy.h             |  2 +
 llvm/include/llvm/Support/Compiler.h          | 11 ++++
 llvm/include/llvm/Support/Registry.h          | 64 ++++++++++---------
 9 files changed, 76 insertions(+), 30 deletions(-)

diff --git a/clang/include/clang/Basic/ParsedAttrInfo.h b/clang/include/clang/Basic/ParsedAttrInfo.h
index fab5c6f1377d27..3b5f5d3c3f92ac 100644
--- a/clang/include/clang/Basic/ParsedAttrInfo.h
+++ b/clang/include/clang/Basic/ParsedAttrInfo.h
@@ -17,6 +17,7 @@
 
 #include "clang/Basic/AttrSubjectMatchRules.h"
 #include "clang/Basic/AttributeCommonInfo.h"
+#include "clang/Support/Compiler.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/Registry.h"
 #include <climits>
@@ -175,4 +176,8 @@ const std::list<std::unique_ptr<ParsedAttrInfo>> &getAttributePluginInstances();
 
 } // namespace clang
 
+namespace llvm {
+extern template class CLANG_TEMPLATE_ABI Registry<clang::ParsedAttrInfo>;
+} // namespace llvm
+
 #endif // LLVM_CLANG_BASIC_PARSEDATTRINFO_H
diff --git a/clang/include/clang/Frontend/FrontendPluginRegistry.h b/clang/include/clang/Frontend/FrontendPluginRegistry.h
index 810578534acb45..5eea9c2fd89a32 100644
--- a/clang/include/clang/Frontend/FrontendPluginRegistry.h
+++ b/clang/include/clang/Frontend/FrontendPluginRegistry.h
@@ -14,6 +14,7 @@
 #define LLVM_CLANG_FRONTEND_FRONTENDPLUGINREGISTRY_H
 
 #include "clang/Frontend/FrontendAction.h"
+#include "clang/Support/Compiler.h"
 #include "llvm/Support/Registry.h"
 
 namespace clang {
@@ -23,4 +24,8 @@ using FrontendPluginRegistry = llvm::Registry<PluginASTAction>;
 
 } // namespace clang
 
+namespace llvm {
+extern template class CLANG_TEMPLATE_ABI Registry<clang::PluginASTAction>;
+} // namespace llvm
+
 #endif // LLVM_CLANG_FRONTEND_FRONTENDPLUGINREGISTRY_H
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 4643b0213815f8..92749e4de44b57 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -32,6 +32,7 @@
 #include "clang/Lex/PPEmbedParameters.h"
 #include "clang/Lex/Token.h"
 #include "clang/Lex/TokenLexer.h"
+#include "clang/Support/Compiler.h"
 #include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -3060,4 +3061,8 @@ using PragmaHandlerRegistry = llvm::Registry<PragmaHandler>;
 
 } // namespace clang
 
+namespace llvm {
+extern template class CLANG_TEMPLATE_ABI Registry<clang::PragmaHandler>;
+} // namespace llvm
+
 #endif // LLVM_CLANG_LEX_PREPROCESSOR_H
diff --git a/clang/include/clang/Tooling/CompilationDatabasePluginRegistry.h b/clang/include/clang/Tooling/CompilationDatabasePluginRegistry.h
index 8c58ad926a402a..e6bcac542b0ecb 100644
--- a/clang/include/clang/Tooling/CompilationDatabasePluginRegistry.h
+++ b/clang/include/clang/Tooling/CompilationDatabasePluginRegistry.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_CLANG_TOOLING_COMPILATIONDATABASEPLUGINREGISTRY_H
 #define LLVM_CLANG_TOOLING_COMPILATIONDATABASEPLUGINREGISTRY_H
 
+#include "clang/Support/Compiler.h"
 #include "clang/Tooling/CompilationDatabase.h"
 #include "llvm/Support/Registry.h"
 
@@ -42,4 +43,9 @@ using CompilationDatabasePluginRegistry =
 } // namespace tooling
 } // namespace clang
 
+namespace llvm {
+extern template class CLANG_TEMPLATE_ABI
+    Registry<clang::tooling::CompilationDatabasePlugin>;
+} // namespace llvm
+
 #endif // LLVM_CLANG_TOOLING_COMPILATIONDATABASEPLUGINREGISTRY_H
diff --git a/clang/include/clang/Tooling/ToolExecutorPluginRegistry.h b/clang/include/clang/Tooling/ToolExecutorPluginRegistry.h
index 5304ff26252def..8d54583234684e 100644
--- a/clang/include/clang/Tooling/ToolExecutorPluginRegistry.h
+++ b/clang/include/clang/Tooling/ToolExecutorPluginRegistry.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_CLANG_TOOLING_TOOLEXECUTORPLUGINREGISTRY_H
 #define LLVM_CLANG_TOOLING_TOOLEXECUTORPLUGINREGISTRY_H
 
+#include "clang/Support/Compiler.h"
 #include "clang/Tooling/Execution.h"
 #include "llvm/Support/Registry.h"
 
@@ -20,4 +21,9 @@ using ToolExecutorPluginRegistry = llvm::Registry<ToolExecutorPlugin>;
 } // namespace tooling
 } // namespace clang
 
+namespace llvm {
+extern template class CLANG_TEMPLATE_ABI
+    Registry<clang::tooling::ToolExecutorPlugin>;
+} // namespace llvm
+
 #endif // LLVM_CLANG_TOOLING_TOOLEXECUTORPLUGINREGISTRY_H
diff --git a/llvm/include/llvm/CodeGen/GCMetadataPrinter.h b/llvm/include/llvm/CodeGen/GCMetadataPrinter.h
index f9527c9f8752e9..9d421be8313f01 100644
--- a/llvm/include/llvm/CodeGen/GCMetadataPrinter.h
+++ b/llvm/include/llvm/CodeGen/GCMetadataPrinter.h
@@ -34,6 +34,8 @@ class StackMaps;
 /// defaults from Registry.
 using GCMetadataPrinterRegistry = Registry<GCMetadataPrinter>;
 
+extern template class LLVM_TEMPLATE_ABI Registry<GCMetadataPrinter>;
+
 /// GCMetadataPrinter - Emits GC metadata as assembly code.  Instances are
 /// created, managed, and owned by the AsmPrinter.
 class GCMetadataPrinter {
diff --git a/llvm/include/llvm/IR/GCStrategy.h b/llvm/include/llvm/IR/GCStrategy.h
index 3186465f001812..cbfbe23aaa0683 100644
--- a/llvm/include/llvm/IR/GCStrategy.h
+++ b/llvm/include/llvm/IR/GCStrategy.h
@@ -141,6 +141,8 @@ class GCStrategy {
 /// GCMetadataPrinterRegistery as well.
 using GCRegistry = Registry<GCStrategy>;
 
+extern template class LLVM_TEMPLATE_ABI Registry<GCStrategy>;
+
 /// Lookup the GCStrategy object associated with the given gc name.
 std::unique_ptr<GCStrategy> getGCStrategy(const StringRef Name);
 
diff --git a/llvm/include/llvm/Support/Compiler.h b/llvm/include/llvm/Support/Compiler.h
index 1d2d751d4dc11a..ab0cbff43d749c 100644
--- a/llvm/include/llvm/Support/Compiler.h
+++ b/llvm/include/llvm/Support/Compiler.h
@@ -153,6 +153,12 @@
 /// exported when llvm is built as a shared library with everything else that is
 /// unannotated will have internal visibility.
 ///
+/// LLVM_ABI_EXPORT is for the special case for things like plugin symbol
+/// declarations or definitions where we don't want the macro to be switching
+/// between dllexport and dllimport on windows based on what codebase is being
+/// built, it will only be dllexport. For non windows platforms this macro
+/// behaves the same as LLVM_ABI.
+///
 /// LLVM_EXPORT_TEMPLATE is used on explicit template instantiations in source
 /// files that were declared extern in a header. This macro is only set as a
 /// compiler export attribute on windows, on other platforms it does nothing.
@@ -179,6 +185,7 @@
 #define LLVM_ABI
 #define LLVM_TEMPLATE_ABI
 #define LLVM_EXPORT_TEMPLATE
+#define LLVM_ABI_EXPORT
 #elif defined(_WIN32) && !defined(__MINGW32__)
 #if defined(LLVM_EXPORTS)
 #define LLVM_ABI __declspec(dllexport)
@@ -189,19 +196,23 @@
 #define LLVM_TEMPLATE_ABI __declspec(dllimport)
 #define LLVM_EXPORT_TEMPLATE
 #endif
+#define LLVM_ABI_EXPORT __declspec(dllexport)
 #elif defined(__ELF__) || defined(__MINGW32__) || defined(_AIX)
 #define LLVM_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT
 #define LLVM_TEMPLATE_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT
 #define LLVM_EXPORT_TEMPLATE
+#define LLVM_ABI_EXPORT LLVM_ATTRIBUTE_VISIBILITY_DEFAULT
 #elif defined(__MACH__) || defined(__WASM__)
 #define LLVM_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT
 #define LLVM_TEMPLATE_ABI
 #define LLVM_EXPORT_TEMPLATE
+#define LLVM_ABI_EXPORT LLVM_ATTRIBUTE_VISIBILITY_DEFAULT
 #endif
 #else
 #define LLVM_ABI
 #define LLVM_TEMPLATE_ABI
 #define LLVM_EXPORT_TEMPLATE
+#define LLVM_ABI_EXPORT
 #endif
 #define LLVM_C_ABI LLVM_ABI
 #endif
diff --git a/llvm/include/llvm/Support/Registry.h b/llvm/include/llvm/Support/Registry.h
index 5bb6a254a47f4c..ff9226c39359c5 100644
--- a/llvm/include/llvm/Support/Registry.h
+++ b/llvm/include/llvm/Support/Registry.h
@@ -53,7 +53,13 @@ namespace llvm {
     Registry() = delete;
 
     friend class node;
-    static node *Head, *Tail;
+    // These must be two separate declarations to work around a 20-year-old
+    // MSVC bug with dllexport and multiple static fields in the same
+    // declaration causing error C2487 "member of dll interface class may not
+    // be declared with dll interface".
+    // https://developercommunity.visualstudio.com/t/c2487-in-dllexport-class-with-static-members/69878
+    static node *Head;
+    static node *Tail;
 
   public:
     /// Node in linked list of entries.
@@ -76,7 +82,13 @@ namespace llvm {
     /// add a node to the executable's registry. Therefore it's not defined here
     /// to avoid it being instantiated in the plugin and is instead defined in
     /// the executable (see LLVM_INSTANTIATE_REGISTRY below).
-    static void add_node(node *N);
+    static void add_node(node *N) {
+      if (Tail)
+        Tail->Next = N;
+      else
+        Head = N;
+      Tail = N;
+    }
 
     /// Iterators for registry entries.
     ///
@@ -95,7 +107,7 @@ namespace llvm {
 
     // begin is not defined here in order to avoid usage of an undefined static
     // data member, instead it's instantiated by LLVM_INSTANTIATE_REGISTRY.
-    static iterator begin();
+    static iterator begin() { return iterator(Head); }
     static iterator end()   { return iterator(nullptr); }
 
     static iterator_range<iterator> entries() {
@@ -124,36 +136,28 @@ namespace llvm {
       }
     };
   };
+
 } // end namespace llvm
 
+#ifdef _WIN32
 /// Instantiate a registry class.
-///
-/// This provides template definitions of add_node, begin, and the Head and Tail
-/// pointers, then explicitly instantiates them. We could explicitly specialize
-/// them, instead of the two-step process of define then instantiate, but
-/// strictly speaking that's not allowed by the C++ standard (we would need to
-/// have explicit specialization declarations in all translation units where the
-/// specialization is used) so we don't.
-#define LLVM_INSTANTIATE_REGISTRY(REGISTRY_CLASS) \
-  namespace llvm { \
-  template<typename T> typename Registry<T>::node *Registry<T>::Head = nullptr;\
-  template<typename T> typename Registry<T>::node *Registry<T>::Tail = nullptr;\
-  template<typename T> \
-  void Registry<T>::add_node(typename Registry<T>::node *N) { \
-    if (Tail) \
-      Tail->Next = N; \
-    else \
-      Head = N; \
-    Tail = N; \
-  } \
-  template<typename T> typename Registry<T>::iterator Registry<T>::begin() { \
-    return iterator(Head); \
-  } \
-  template REGISTRY_CLASS::node *Registry<REGISTRY_CLASS::type>::Head; \
-  template REGISTRY_CLASS::node *Registry<REGISTRY_CLASS::type>::Tail; \
-  template \
-  void Registry<REGISTRY_CLASS::type>::add_node(REGISTRY_CLASS::node*); \
-  template REGISTRY_CLASS::iterator Registry<REGISTRY_CLASS::type>::begin(); \
+#define LLVM_INSTANTIATE_REGISTRY(REGISTRY_CLASS)                              \
+  namespace llvm {                                                             \
+  template <typename T>                                                        \
+  typename Registry<T>::node *Registry<T>::Head = nullptr;                     \
+  template <typename T>                                                        \
+  typename Registry<T>::node *Registry<T>::Tail = nullptr;                     \
+  template class LLVM_ABI_EXPORT Registry<REGISTRY_CLASS::type>;               \
+  }
+#else
+#define LLVM_INSTANTIATE_REGISTRY(REGISTRY_CLASS)                              \
+  namespace llvm {                                                             \
+  template <typename T>                                                        \
+  typename Registry<T>::node *Registry<T>::Head = nullptr;                     \
+  template <typename T>                                                        \
+  typename Registry<T>::node *Registry<T>::Tail = nullptr;                     \
+  template class Registry<REGISTRY_CLASS::type>;                               \
   }
+#endif
 
 #endif // LLVM_SUPPORT_REGISTRY_H

>From 682925ef43902282173dd9e27cc4a5cc7b794821 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Tue, 15 Oct 2024 22:58:07 -0700
Subject: [PATCH 03/82] [ELF] Pass Ctx & to Partition

---
 lld/ELF/Driver.cpp            |  4 +-
 lld/ELF/InputSection.h        |  2 +-
 lld/ELF/Relocations.cpp       |  4 +-
 lld/ELF/SyntheticSections.cpp | 78 +++++++++++++++++------------------
 lld/ELF/SyntheticSections.h   |  4 +-
 lld/ELF/Writer.cpp            |  4 +-
 6 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index afacbbfe9099cc..fb77e67e9fc5ca 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -2471,7 +2471,7 @@ static void readSymbolPartitionSection(Ctx &ctx, InputSectionBase *s) {
   StringRef partName = reinterpret_cast<const char *>(s->content().data());
   for (Partition &part : ctx.partitions) {
     if (part.name == partName) {
-      sym->partition = part.getNumber();
+      sym->partition = part.getNumber(ctx);
       return;
     }
   }
@@ -2500,7 +2500,7 @@ static void readSymbolPartitionSection(Ctx &ctx, InputSectionBase *s) {
   ctx.partitions.emplace_back(ctx);
   Partition &newPart = ctx.partitions.back();
   newPart.name = partName;
-  sym->partition = newPart.getNumber();
+  sym->partition = newPart.getNumber(ctx);
 }
 
 static void markBuffersAsDontNeed(Ctx &ctx, bool skipLinkedOutput) {
diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h
index 1a5bc629d8b093..dfcc7c8bc852a9 100644
--- a/lld/ELF/InputSection.h
+++ b/lld/ELF/InputSection.h
@@ -89,7 +89,7 @@ class SectionBase {
   // The 1-indexed partition that this section is assigned to by the garbage
   // collector, or 0 if this section is dead. Normally there is only one
   // partition, so this will either be 0 or 1.
-  elf::Partition &getPartition() const;
+  elf::Partition &getPartition(Ctx &) const;
 
   // These corresponds to the fields in Elf_Shdr.
   uint64_t flags;
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 73c19c8385a824..0188d658f9210e 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -878,7 +878,7 @@ template <bool shard = false>
 static void addRelativeReloc(Ctx &ctx, InputSectionBase &isec,
                              uint64_t offsetInSec, Symbol &sym, int64_t addend,
                              RelExpr expr, RelType type) {
-  Partition &part = isec.getPartition();
+  Partition &part = isec.getPartition(ctx);
 
   if (sym.isTagged()) {
     std::lock_guard<std::mutex> lock(relocMutex);
@@ -1159,7 +1159,7 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset,
       if (ctx.arg.emachine == EM_MIPS && rel == ctx.target->symbolicRel)
         rel = ctx.target->relativeRel;
       std::lock_guard<std::mutex> lock(relocMutex);
-      Partition &part = sec->getPartition();
+      Partition &part = sec->getPartition(ctx);
       if (ctx.arg.emachine == EM_AARCH64 && type == R_AARCH64_AUTH_ABS64) {
         // For a preemptible symbol, we can't use a relative relocation. For an
         // undefined symbol, we can't compute offset at link-time and use a
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index d99aeac50ca5ce..e18e7a32df86c7 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -563,7 +563,7 @@ SmallVector<EhFrameSection::FdeData, 0> EhFrameSection::getFdeData() const {
   uint8_t *buf = ctx.bufferStart + getParent()->offset + outSecOff;
   SmallVector<FdeData, 0> ret;
 
-  uint64_t va = getPartition().ehFrameHdr->getVA();
+  uint64_t va = getPartition(ctx).ehFrameHdr->getVA();
   for (CieRecord *rec : cieRecords) {
     uint8_t enc = getFdeEncoding(rec->cie);
     for (EhSectionPiece *fde : rec->fdes) {
@@ -650,8 +650,8 @@ void EhFrameSection::writeTo(uint8_t *buf) {
   for (EhInputSection *s : sections)
     ctx.target->relocateAlloc(*s, buf);
 
-  if (getPartition().ehFrameHdr && getPartition().ehFrameHdr->getParent())
-    getPartition().ehFrameHdr->write();
+  if (getPartition(ctx).ehFrameHdr && getPartition(ctx).ehFrameHdr->getParent())
+    getPartition(ctx).ehFrameHdr->write();
 }
 
 GotSection::GotSection(Ctx &ctx)
@@ -1325,7 +1325,7 @@ static uint64_t addPltRelSz(Ctx &ctx) { return ctx.in.relaPlt->getSize(); }
 template <class ELFT>
 std::vector<std::pair<int32_t, uint64_t>>
 DynamicSection<ELFT>::computeContents() {
-  elf::Partition &part = getPartition();
+  elf::Partition &part = getPartition(ctx);
   bool isMain = part.name.empty();
   std::vector<std::pair<int32_t, uint64_t>> entries;
 
@@ -1586,7 +1586,7 @@ DynamicSection<ELFT>::computeContents() {
 }
 
 template <class ELFT> void DynamicSection<ELFT>::finalizeContents() {
-  if (OutputSection *sec = getPartition().dynStrTab->getParent())
+  if (OutputSection *sec = getPartition(ctx).dynStrTab->getParent())
     getParent()->link = sec->sectionIndex;
   this->size = computeContents().size() * this->entsize;
 }
@@ -1688,7 +1688,7 @@ void RelocationBaseSection::partitionRels() {
 }
 
 void RelocationBaseSection::finalizeContents() {
-  SymbolTableBaseSection *symTab = getPartition().dynSymTab.get();
+  SymbolTableBaseSection *symTab = getPartition(ctx).dynSymTab.get();
 
   // When linking glibc statically, .rel{,a}.plt contains R_*_IRELATIVE
   // relocations due to IFUNC (e.g. strcpy). sh_link will be set to 0 in that
@@ -1712,7 +1712,7 @@ void DynamicReloc::computeRaw(Ctx &ctx, SymbolTableBaseSection *symt) {
 }
 
 void RelocationBaseSection::computeRels() {
-  SymbolTableBaseSection *symTab = getPartition().dynSymTab.get();
+  SymbolTableBaseSection *symTab = getPartition(ctx).dynSymTab.get();
   parallelForEach(relocs, [&ctx = ctx, symTab](DynamicReloc &rel) {
     rel.computeRaw(ctx, symTab);
   });
@@ -1852,7 +1852,7 @@ bool AndroidPackedRelocationSection<ELFT>::updateAllocSize(Ctx &ctx) {
   for (const DynamicReloc &rel : relocs) {
     Elf_Rela r;
     r.r_offset = rel.getOffset();
-    r.setSymbolAndType(rel.getSymIndex(getPartition().dynSymTab.get()),
+    r.setSymbolAndType(rel.getSymIndex(getPartition(ctx).dynSymTab.get()),
                        rel.type, false);
     r.r_addend = ctx.arg.isRela ? rel.computeAddend(ctx) : 0;
 
@@ -2162,9 +2162,9 @@ void SymbolTableBaseSection::finalizeContents() {
   // Because the first symbol entry is a null entry, 1 is the first.
   getParent()->info = 1;
 
-  if (getPartition().gnuHashTab) {
+  if (getPartition(ctx).gnuHashTab) {
     // NB: It also sorts Symbols to meet the GNU hash table requirements.
-    getPartition().gnuHashTab->addSymbols(symbols);
+    getPartition(ctx).gnuHashTab->addSymbols(symbols);
   } else if (ctx.arg.emachine == EM_MIPS) {
     sortMipsSymbols(ctx, symbols);
   }
@@ -2416,7 +2416,7 @@ GnuHashTableSection::GnuHashTableSection(Ctx &ctx)
                        ".gnu.hash") {}
 
 void GnuHashTableSection::finalizeContents() {
-  if (OutputSection *sec = getPartition().dynSymTab->getParent())
+  if (OutputSection *sec = getPartition(ctx).dynSymTab->getParent())
     getParent()->link = sec->sectionIndex;
 
   // Computes bloom filter size in word size. We want to allocate 12
@@ -2438,7 +2438,7 @@ void GnuHashTableSection::writeTo(uint8_t *buf) {
   // Write a header.
   write32(ctx, buf, nBuckets);
   write32(ctx, buf + 4,
-          getPartition().dynSymTab->getNumSymbols() - symbols.size());
+          getPartition(ctx).dynSymTab->getNumSymbols() - symbols.size());
   write32(ctx, buf + 8, maskWords);
   write32(ctx, buf + 12, Shift2);
   buf += 16;
@@ -2474,7 +2474,7 @@ void GnuHashTableSection::writeTo(uint8_t *buf) {
     // Write a hash bucket. Hash buckets contain indices in the following hash
     // value table.
     write32(ctx, buckets + i->bucketIdx,
-            getPartition().dynSymTab->getSymbolIndex(*i->sym));
+            getPartition(ctx).dynSymTab->getSymbolIndex(*i->sym));
     oldBucket = i->bucketIdx;
   }
 }
@@ -2527,7 +2527,7 @@ HashTableSection::HashTableSection(Ctx &ctx)
 }
 
 void HashTableSection::finalizeContents() {
-  SymbolTableBaseSection *symTab = getPartition().dynSymTab.get();
+  SymbolTableBaseSection *symTab = getPartition(ctx).dynSymTab.get();
 
   if (OutputSection *sec = symTab->getParent())
     getParent()->link = sec->sectionIndex;
@@ -2541,7 +2541,7 @@ void HashTableSection::finalizeContents() {
 }
 
 void HashTableSection::writeTo(uint8_t *buf) {
-  SymbolTableBaseSection *symTab = getPartition().dynSymTab.get();
+  SymbolTableBaseSection *symTab = getPartition(ctx).dynSymTab.get();
   unsigned numSymbols = symTab->getNumSymbols();
 
   uint32_t *p = reinterpret_cast<uint32_t *>(buf);
@@ -3667,14 +3667,14 @@ void EhFrameHeader::writeTo(uint8_t *buf) {
 void EhFrameHeader::write() {
   uint8_t *buf = ctx.bufferStart + getParent()->offset + outSecOff;
   using FdeData = EhFrameSection::FdeData;
-  SmallVector<FdeData, 0> fdes = getPartition().ehFrame->getFdeData();
+  SmallVector<FdeData, 0> fdes = getPartition(ctx).ehFrame->getFdeData();
 
   buf[0] = 1;
   buf[1] = DW_EH_PE_pcrel | DW_EH_PE_sdata4;
   buf[2] = DW_EH_PE_udata4;
   buf[3] = DW_EH_PE_datarel | DW_EH_PE_sdata4;
   write32(ctx, buf + 4,
-          getPartition().ehFrame->getParent()->addr - this->getVA() - 4);
+          getPartition(ctx).ehFrame->getParent()->addr - this->getVA() - 4);
   write32(ctx, buf + 8, fdes.size());
   buf += 12;
 
@@ -3687,11 +3687,11 @@ void EhFrameHeader::write() {
 
 size_t EhFrameHeader::getSize() const {
   // .eh_frame_hdr has a 12 bytes header followed by an array of FDEs.
-  return 12 + getPartition().ehFrame->numFdes * 8;
+  return 12 + getPartition(ctx).ehFrame->numFdes * 8;
 }
 
 bool EhFrameHeader::isNeeded() const {
-  return isLive() && getPartition().ehFrame->isNeeded();
+  return isLive() && getPartition(ctx).ehFrame->isNeeded();
 }
 
 VersionDefinitionSection::VersionDefinitionSection(Ctx &ctx)
@@ -3699,19 +3699,19 @@ VersionDefinitionSection::VersionDefinitionSection(Ctx &ctx)
                        ".gnu.version_d") {}
 
 StringRef VersionDefinitionSection::getFileDefName() {
-  if (!getPartition().name.empty())
-    return getPartition().name;
+  if (!getPartition(ctx).name.empty())
+    return getPartition(ctx).name;
   if (!ctx.arg.soName.empty())
     return ctx.arg.soName;
   return ctx.arg.outputFile;
 }
 
 void VersionDefinitionSection::finalizeContents() {
-  fileDefNameOff = getPartition().dynStrTab->addString(getFileDefName());
+  fileDefNameOff = getPartition(ctx).dynStrTab->addString(getFileDefName());
   for (const VersionDefinition &v : namedVersionDefs(ctx))
-    verDefNameOffs.push_back(getPartition().dynStrTab->addString(v.name));
+    verDefNameOffs.push_back(getPartition(ctx).dynStrTab->addString(v.name));
 
-  if (OutputSection *sec = getPartition().dynStrTab->getParent())
+  if (OutputSection *sec = getPartition(ctx).dynStrTab->getParent())
     getParent()->link = sec->sectionIndex;
 
   // sh_info should be set to the number of definitions. This fact is missed in
@@ -3765,16 +3765,16 @@ VersionTableSection::VersionTableSection(Ctx &ctx)
 void VersionTableSection::finalizeContents() {
   // At the moment of june 2016 GNU docs does not mention that sh_link field
   // should be set, but Sun docs do. Also readelf relies on this field.
-  getParent()->link = getPartition().dynSymTab->getParent()->sectionIndex;
+  getParent()->link = getPartition(ctx).dynSymTab->getParent()->sectionIndex;
 }
 
 size_t VersionTableSection::getSize() const {
-  return (getPartition().dynSymTab->getSymbols().size() + 1) * 2;
+  return (getPartition(ctx).dynSymTab->getSymbols().size() + 1) * 2;
 }
 
 void VersionTableSection::writeTo(uint8_t *buf) {
   buf += 2;
-  for (const SymbolTableEntry &s : getPartition().dynSymTab->getSymbols()) {
+  for (const SymbolTableEntry &s : getPartition(ctx).dynSymTab->getSymbols()) {
     // For an unextracted lazy symbol (undefined weak), it must have been
     // converted to Undefined and have VER_NDX_GLOBAL version here.
     assert(!s.sym->isLazy());
@@ -3785,7 +3785,7 @@ void VersionTableSection::writeTo(uint8_t *buf) {
 
 bool VersionTableSection::isNeeded() const {
   return isLive() &&
-         (getPartition().verDef || getPartition().verNeed->isNeeded());
+         (getPartition(ctx).verDef || getPartition(ctx).verNeed->isNeeded());
 }
 
 void elf::addVerneed(Ctx &ctx, Symbol &ss) {
@@ -3817,7 +3817,7 @@ template <class ELFT> void VersionNeedSection<ELFT>::finalizeContents() {
       continue;
     verneeds.emplace_back();
     Verneed &vn = verneeds.back();
-    vn.nameStrTab = getPartition().dynStrTab->addString(f->soName);
+    vn.nameStrTab = getPartition(ctx).dynStrTab->addString(f->soName);
     bool isLibc = ctx.arg.relrGlibc && f->soName.starts_with("libc.so.");
     bool isGlibc2 = false;
     for (unsigned i = 0; i != f->vernauxs.size(); ++i) {
@@ -3829,17 +3829,17 @@ template <class ELFT> void VersionNeedSection<ELFT>::finalizeContents() {
       if (isLibc && ver.starts_with("GLIBC_2."))
         isGlibc2 = true;
       vn.vernauxs.push_back({verdef->vd_hash, f->vernauxs[i],
-                             getPartition().dynStrTab->addString(ver)});
+                             getPartition(ctx).dynStrTab->addString(ver)});
     }
     if (isGlibc2) {
       const char *ver = "GLIBC_ABI_DT_RELR";
       vn.vernauxs.push_back({hashSysV(ver),
                              ++SharedFile::vernauxNum + getVerDefNum(ctx),
-                             getPartition().dynStrTab->addString(ver)});
+                             getPartition(ctx).dynStrTab->addString(ver)});
     }
   }
 
-  if (OutputSection *sec = getPartition().dynStrTab->getParent())
+  if (OutputSection *sec = getPartition(ctx).dynStrTab->getParent())
     getParent()->link = sec->sectionIndex;
   getParent()->info = verneeds.size();
 }
@@ -3995,7 +3995,7 @@ template <class ELFT> void elf::splitSections(Ctx &ctx) {
 void elf::combineEhSections(Ctx &ctx) {
   llvm::TimeTraceScope timeScope("Combine EH sections");
   for (EhInputSection *sec : ctx.ehInputSections) {
-    EhFrameSection &eh = *sec->getPartition().ehFrame;
+    EhFrameSection &eh = *sec->getPartition(ctx).ehFrame;
     sec->parent = &eh;
     eh.addralign = std::max(eh.addralign, sec->addralign);
     eh.sections.push_back(sec);
@@ -4004,12 +4004,12 @@ void elf::combineEhSections(Ctx &ctx) {
 
   if (!ctx.mainPart->armExidx)
     return;
-  llvm::erase_if(ctx.inputSections, [](InputSectionBase *s) {
+  llvm::erase_if(ctx.inputSections, [&](InputSectionBase *s) {
     // Ignore dead sections and the partition end marker (.part.end),
     // whose partition number is out of bounds.
     if (!s->isLive() || s->partition == 255)
       return false;
-    Partition &part = s->getPartition();
+    Partition &part = s->getPartition(ctx);
     return s->kind() == SectionBase::Regular && part.armExidx &&
            part.armExidx->addSection(cast<InputSection>(s));
   });
@@ -4447,7 +4447,7 @@ size_t PartitionElfHeaderSection<ELFT>::getSize() const {
 
 template <typename ELFT>
 void PartitionElfHeaderSection<ELFT>::writeTo(uint8_t *buf) {
-  writeEhdr<ELFT>(ctx, buf, getPartition());
+  writeEhdr<ELFT>(ctx, buf, getPartition(ctx));
 
   // Loadable partitions are always ET_DYN.
   auto *eHdr = reinterpret_cast<typename ELFT::Ehdr *>(buf);
@@ -4460,12 +4460,12 @@ PartitionProgramHeadersSection<ELFT>::PartitionProgramHeadersSection(Ctx &ctx)
 
 template <typename ELFT>
 size_t PartitionProgramHeadersSection<ELFT>::getSize() const {
-  return sizeof(typename ELFT::Phdr) * getPartition().phdrs.size();
+  return sizeof(typename ELFT::Phdr) * getPartition(ctx).phdrs.size();
 }
 
 template <typename ELFT>
 void PartitionProgramHeadersSection<ELFT>::writeTo(uint8_t *buf) {
-  writePhdrs<ELFT>(buf, getPartition());
+  writePhdrs<ELFT>(buf, getPartition(ctx));
 }
 
 PartitionIndexSection::PartitionIndexSection(Ctx &ctx)
@@ -4747,7 +4747,7 @@ template <class ELFT> void elf::createSyntheticSections(Ctx &ctx) {
   const unsigned threadCount = ctx.arg.threadCount;
   for (Partition &part : ctx.partitions) {
     auto add = [&](SyntheticSection &sec) {
-      sec.partition = part.getNumber();
+      sec.partition = part.getNumber(ctx);
       ctx.inputSections.push_back(&sec);
     };
 
diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h
index 7ddbaff5573fdb..d64c4aad8c552b 100644
--- a/lld/ELF/SyntheticSections.h
+++ b/lld/ELF/SyntheticSections.h
@@ -1475,10 +1475,10 @@ struct Partition {
   std::unique_ptr<VersionTableSection> verSym;
 
   Partition(Ctx &ctx) : ctx(ctx) {}
-  unsigned getNumber() const { return this - &ctx.partitions[0] + 1; }
+  unsigned getNumber(Ctx &ctx) const { return this - &ctx.partitions[0] + 1; }
 };
 
-inline Partition &SectionBase::getPartition() const {
+inline Partition &SectionBase::getPartition(Ctx &ctx) const {
   assert(isLive());
   return ctx.partitions[partition - 1];
 }
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 49b6ab4a48b55a..2cd4478d00cf5d 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -2191,7 +2191,7 @@ SmallVector<PhdrEntry *, 0> Writer<ELFT>::createPhdrs(Partition &part) {
     return ret.back();
   };
 
-  unsigned partNo = part.getNumber();
+  unsigned partNo = part.getNumber(ctx);
   bool isMain = partNo == 1;
 
   // Add the first PT_LOAD segment for regular output sections.
@@ -2381,7 +2381,7 @@ SmallVector<PhdrEntry *, 0> Writer<ELFT>::createPhdrs(Partition &part) {
 template <class ELFT>
 void Writer<ELFT>::addPhdrForSection(Partition &part, unsigned shType,
                                      unsigned pType, unsigned pFlags) {
-  unsigned partNo = part.getNumber();
+  unsigned partNo = part.getNumber(ctx);
   auto i = llvm::find_if(ctx.outputSections, [=](OutputSection *cmd) {
     return cmd->partition == partNo && cmd->type == shType;
   });

>From b8882be26f00d2a053269948ee6ecaeff8db8eb8 Mon Sep 17 00:00:00 2001
From: Vassil Vassilev <v.g.vassilev at gmail.com>
Date: Wed, 16 Oct 2024 06:03:45 +0000
Subject: [PATCH 04/82] Revert "Update llvm::Registry to work for LLVM shared
 library builds on windows (#109024)"

This reverts commit 00cd1a06daa7f950cf0954c7f9fafc371c255639.

This effectively reverts llvm/llvm-project#109024
---
 clang/include/clang/Basic/ParsedAttrInfo.h    |  5 --
 .../clang/Frontend/FrontendPluginRegistry.h   |  5 --
 clang/include/clang/Lex/Preprocessor.h        |  5 --
 .../CompilationDatabasePluginRegistry.h       |  6 --
 .../Tooling/ToolExecutorPluginRegistry.h      |  6 --
 llvm/include/llvm/CodeGen/GCMetadataPrinter.h |  2 -
 llvm/include/llvm/IR/GCStrategy.h             |  2 -
 llvm/include/llvm/Support/Compiler.h          | 11 ----
 llvm/include/llvm/Support/Registry.h          | 64 +++++++++----------
 9 files changed, 30 insertions(+), 76 deletions(-)

diff --git a/clang/include/clang/Basic/ParsedAttrInfo.h b/clang/include/clang/Basic/ParsedAttrInfo.h
index 3b5f5d3c3f92ac..fab5c6f1377d27 100644
--- a/clang/include/clang/Basic/ParsedAttrInfo.h
+++ b/clang/include/clang/Basic/ParsedAttrInfo.h
@@ -17,7 +17,6 @@
 
 #include "clang/Basic/AttrSubjectMatchRules.h"
 #include "clang/Basic/AttributeCommonInfo.h"
-#include "clang/Support/Compiler.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/Registry.h"
 #include <climits>
@@ -176,8 +175,4 @@ const std::list<std::unique_ptr<ParsedAttrInfo>> &getAttributePluginInstances();
 
 } // namespace clang
 
-namespace llvm {
-extern template class CLANG_TEMPLATE_ABI Registry<clang::ParsedAttrInfo>;
-} // namespace llvm
-
 #endif // LLVM_CLANG_BASIC_PARSEDATTRINFO_H
diff --git a/clang/include/clang/Frontend/FrontendPluginRegistry.h b/clang/include/clang/Frontend/FrontendPluginRegistry.h
index 5eea9c2fd89a32..810578534acb45 100644
--- a/clang/include/clang/Frontend/FrontendPluginRegistry.h
+++ b/clang/include/clang/Frontend/FrontendPluginRegistry.h
@@ -14,7 +14,6 @@
 #define LLVM_CLANG_FRONTEND_FRONTENDPLUGINREGISTRY_H
 
 #include "clang/Frontend/FrontendAction.h"
-#include "clang/Support/Compiler.h"
 #include "llvm/Support/Registry.h"
 
 namespace clang {
@@ -24,8 +23,4 @@ using FrontendPluginRegistry = llvm::Registry<PluginASTAction>;
 
 } // namespace clang
 
-namespace llvm {
-extern template class CLANG_TEMPLATE_ABI Registry<clang::PluginASTAction>;
-} // namespace llvm
-
 #endif // LLVM_CLANG_FRONTEND_FRONTENDPLUGINREGISTRY_H
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 92749e4de44b57..4643b0213815f8 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -32,7 +32,6 @@
 #include "clang/Lex/PPEmbedParameters.h"
 #include "clang/Lex/Token.h"
 #include "clang/Lex/TokenLexer.h"
-#include "clang/Support/Compiler.h"
 #include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -3061,8 +3060,4 @@ using PragmaHandlerRegistry = llvm::Registry<PragmaHandler>;
 
 } // namespace clang
 
-namespace llvm {
-extern template class CLANG_TEMPLATE_ABI Registry<clang::PragmaHandler>;
-} // namespace llvm
-
 #endif // LLVM_CLANG_LEX_PREPROCESSOR_H
diff --git a/clang/include/clang/Tooling/CompilationDatabasePluginRegistry.h b/clang/include/clang/Tooling/CompilationDatabasePluginRegistry.h
index e6bcac542b0ecb..8c58ad926a402a 100644
--- a/clang/include/clang/Tooling/CompilationDatabasePluginRegistry.h
+++ b/clang/include/clang/Tooling/CompilationDatabasePluginRegistry.h
@@ -9,7 +9,6 @@
 #ifndef LLVM_CLANG_TOOLING_COMPILATIONDATABASEPLUGINREGISTRY_H
 #define LLVM_CLANG_TOOLING_COMPILATIONDATABASEPLUGINREGISTRY_H
 
-#include "clang/Support/Compiler.h"
 #include "clang/Tooling/CompilationDatabase.h"
 #include "llvm/Support/Registry.h"
 
@@ -43,9 +42,4 @@ using CompilationDatabasePluginRegistry =
 } // namespace tooling
 } // namespace clang
 
-namespace llvm {
-extern template class CLANG_TEMPLATE_ABI
-    Registry<clang::tooling::CompilationDatabasePlugin>;
-} // namespace llvm
-
 #endif // LLVM_CLANG_TOOLING_COMPILATIONDATABASEPLUGINREGISTRY_H
diff --git a/clang/include/clang/Tooling/ToolExecutorPluginRegistry.h b/clang/include/clang/Tooling/ToolExecutorPluginRegistry.h
index 8d54583234684e..5304ff26252def 100644
--- a/clang/include/clang/Tooling/ToolExecutorPluginRegistry.h
+++ b/clang/include/clang/Tooling/ToolExecutorPluginRegistry.h
@@ -9,7 +9,6 @@
 #ifndef LLVM_CLANG_TOOLING_TOOLEXECUTORPLUGINREGISTRY_H
 #define LLVM_CLANG_TOOLING_TOOLEXECUTORPLUGINREGISTRY_H
 
-#include "clang/Support/Compiler.h"
 #include "clang/Tooling/Execution.h"
 #include "llvm/Support/Registry.h"
 
@@ -21,9 +20,4 @@ using ToolExecutorPluginRegistry = llvm::Registry<ToolExecutorPlugin>;
 } // namespace tooling
 } // namespace clang
 
-namespace llvm {
-extern template class CLANG_TEMPLATE_ABI
-    Registry<clang::tooling::ToolExecutorPlugin>;
-} // namespace llvm
-
 #endif // LLVM_CLANG_TOOLING_TOOLEXECUTORPLUGINREGISTRY_H
diff --git a/llvm/include/llvm/CodeGen/GCMetadataPrinter.h b/llvm/include/llvm/CodeGen/GCMetadataPrinter.h
index 9d421be8313f01..f9527c9f8752e9 100644
--- a/llvm/include/llvm/CodeGen/GCMetadataPrinter.h
+++ b/llvm/include/llvm/CodeGen/GCMetadataPrinter.h
@@ -34,8 +34,6 @@ class StackMaps;
 /// defaults from Registry.
 using GCMetadataPrinterRegistry = Registry<GCMetadataPrinter>;
 
-extern template class LLVM_TEMPLATE_ABI Registry<GCMetadataPrinter>;
-
 /// GCMetadataPrinter - Emits GC metadata as assembly code.  Instances are
 /// created, managed, and owned by the AsmPrinter.
 class GCMetadataPrinter {
diff --git a/llvm/include/llvm/IR/GCStrategy.h b/llvm/include/llvm/IR/GCStrategy.h
index cbfbe23aaa0683..3186465f001812 100644
--- a/llvm/include/llvm/IR/GCStrategy.h
+++ b/llvm/include/llvm/IR/GCStrategy.h
@@ -141,8 +141,6 @@ class GCStrategy {
 /// GCMetadataPrinterRegistery as well.
 using GCRegistry = Registry<GCStrategy>;
 
-extern template class LLVM_TEMPLATE_ABI Registry<GCStrategy>;
-
 /// Lookup the GCStrategy object associated with the given gc name.
 std::unique_ptr<GCStrategy> getGCStrategy(const StringRef Name);
 
diff --git a/llvm/include/llvm/Support/Compiler.h b/llvm/include/llvm/Support/Compiler.h
index ab0cbff43d749c..1d2d751d4dc11a 100644
--- a/llvm/include/llvm/Support/Compiler.h
+++ b/llvm/include/llvm/Support/Compiler.h
@@ -153,12 +153,6 @@
 /// exported when llvm is built as a shared library with everything else that is
 /// unannotated will have internal visibility.
 ///
-/// LLVM_ABI_EXPORT is for the special case for things like plugin symbol
-/// declarations or definitions where we don't want the macro to be switching
-/// between dllexport and dllimport on windows based on what codebase is being
-/// built, it will only be dllexport. For non windows platforms this macro
-/// behaves the same as LLVM_ABI.
-///
 /// LLVM_EXPORT_TEMPLATE is used on explicit template instantiations in source
 /// files that were declared extern in a header. This macro is only set as a
 /// compiler export attribute on windows, on other platforms it does nothing.
@@ -185,7 +179,6 @@
 #define LLVM_ABI
 #define LLVM_TEMPLATE_ABI
 #define LLVM_EXPORT_TEMPLATE
-#define LLVM_ABI_EXPORT
 #elif defined(_WIN32) && !defined(__MINGW32__)
 #if defined(LLVM_EXPORTS)
 #define LLVM_ABI __declspec(dllexport)
@@ -196,23 +189,19 @@
 #define LLVM_TEMPLATE_ABI __declspec(dllimport)
 #define LLVM_EXPORT_TEMPLATE
 #endif
-#define LLVM_ABI_EXPORT __declspec(dllexport)
 #elif defined(__ELF__) || defined(__MINGW32__) || defined(_AIX)
 #define LLVM_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT
 #define LLVM_TEMPLATE_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT
 #define LLVM_EXPORT_TEMPLATE
-#define LLVM_ABI_EXPORT LLVM_ATTRIBUTE_VISIBILITY_DEFAULT
 #elif defined(__MACH__) || defined(__WASM__)
 #define LLVM_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT
 #define LLVM_TEMPLATE_ABI
 #define LLVM_EXPORT_TEMPLATE
-#define LLVM_ABI_EXPORT LLVM_ATTRIBUTE_VISIBILITY_DEFAULT
 #endif
 #else
 #define LLVM_ABI
 #define LLVM_TEMPLATE_ABI
 #define LLVM_EXPORT_TEMPLATE
-#define LLVM_ABI_EXPORT
 #endif
 #define LLVM_C_ABI LLVM_ABI
 #endif
diff --git a/llvm/include/llvm/Support/Registry.h b/llvm/include/llvm/Support/Registry.h
index ff9226c39359c5..5bb6a254a47f4c 100644
--- a/llvm/include/llvm/Support/Registry.h
+++ b/llvm/include/llvm/Support/Registry.h
@@ -53,13 +53,7 @@ namespace llvm {
     Registry() = delete;
 
     friend class node;
-    // These must be two separate declarations to work around a 20-year-old
-    // MSVC bug with dllexport and multiple static fields in the same
-    // declaration causing error C2487 "member of dll interface class may not
-    // be declared with dll interface".
-    // https://developercommunity.visualstudio.com/t/c2487-in-dllexport-class-with-static-members/69878
-    static node *Head;
-    static node *Tail;
+    static node *Head, *Tail;
 
   public:
     /// Node in linked list of entries.
@@ -82,13 +76,7 @@ namespace llvm {
     /// add a node to the executable's registry. Therefore it's not defined here
     /// to avoid it being instantiated in the plugin and is instead defined in
     /// the executable (see LLVM_INSTANTIATE_REGISTRY below).
-    static void add_node(node *N) {
-      if (Tail)
-        Tail->Next = N;
-      else
-        Head = N;
-      Tail = N;
-    }
+    static void add_node(node *N);
 
     /// Iterators for registry entries.
     ///
@@ -107,7 +95,7 @@ namespace llvm {
 
     // begin is not defined here in order to avoid usage of an undefined static
     // data member, instead it's instantiated by LLVM_INSTANTIATE_REGISTRY.
-    static iterator begin() { return iterator(Head); }
+    static iterator begin();
     static iterator end()   { return iterator(nullptr); }
 
     static iterator_range<iterator> entries() {
@@ -136,28 +124,36 @@ namespace llvm {
       }
     };
   };
-
 } // end namespace llvm
 
-#ifdef _WIN32
 /// Instantiate a registry class.
-#define LLVM_INSTANTIATE_REGISTRY(REGISTRY_CLASS)                              \
-  namespace llvm {                                                             \
-  template <typename T>                                                        \
-  typename Registry<T>::node *Registry<T>::Head = nullptr;                     \
-  template <typename T>                                                        \
-  typename Registry<T>::node *Registry<T>::Tail = nullptr;                     \
-  template class LLVM_ABI_EXPORT Registry<REGISTRY_CLASS::type>;               \
-  }
-#else
-#define LLVM_INSTANTIATE_REGISTRY(REGISTRY_CLASS)                              \
-  namespace llvm {                                                             \
-  template <typename T>                                                        \
-  typename Registry<T>::node *Registry<T>::Head = nullptr;                     \
-  template <typename T>                                                        \
-  typename Registry<T>::node *Registry<T>::Tail = nullptr;                     \
-  template class Registry<REGISTRY_CLASS::type>;                               \
+///
+/// This provides template definitions of add_node, begin, and the Head and Tail
+/// pointers, then explicitly instantiates them. We could explicitly specialize
+/// them, instead of the two-step process of define then instantiate, but
+/// strictly speaking that's not allowed by the C++ standard (we would need to
+/// have explicit specialization declarations in all translation units where the
+/// specialization is used) so we don't.
+#define LLVM_INSTANTIATE_REGISTRY(REGISTRY_CLASS) \
+  namespace llvm { \
+  template<typename T> typename Registry<T>::node *Registry<T>::Head = nullptr;\
+  template<typename T> typename Registry<T>::node *Registry<T>::Tail = nullptr;\
+  template<typename T> \
+  void Registry<T>::add_node(typename Registry<T>::node *N) { \
+    if (Tail) \
+      Tail->Next = N; \
+    else \
+      Head = N; \
+    Tail = N; \
+  } \
+  template<typename T> typename Registry<T>::iterator Registry<T>::begin() { \
+    return iterator(Head); \
+  } \
+  template REGISTRY_CLASS::node *Registry<REGISTRY_CLASS::type>::Head; \
+  template REGISTRY_CLASS::node *Registry<REGISTRY_CLASS::type>::Tail; \
+  template \
+  void Registry<REGISTRY_CLASS::type>::add_node(REGISTRY_CLASS::node*); \
+  template REGISTRY_CLASS::iterator Registry<REGISTRY_CLASS::type>::begin(); \
   }
-#endif
 
 #endif // LLVM_SUPPORT_REGISTRY_H

>From 282ab2f1895450707c9f8fc6a46634620165d1c9 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Tue, 15 Oct 2024 23:11:30 -0700
Subject: [PATCH 05/82] [lldb] Avoid repeated hash lookups (NFC) (#112471)

---
 lldb/source/Core/Progress.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
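
The change follows the usual single-lookup pattern: call find() once instead
of contains() followed by operator[]. A small standalone sketch (using
std::unordered_map rather than the actual lldb map type):

  #include <string>
  #include <unordered_map>

  struct Entry {
    unsigned refcount = 0;
  };

  // One hash lookup: find() both checks for presence and yields the mapped
  // value, avoiding the second lookup that operator[] would perform.
  void decrement(std::unordered_map<std::string, Entry> &entries,
                 const std::string &key) {
    auto it = entries.find(key);
    if (it == entries.end())
      return;
    Entry &entry = it->second;
    entry.refcount--;
    if (entry.refcount == 0)
      entries.erase(it);
  }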

diff --git a/lldb/source/Core/Progress.cpp b/lldb/source/Core/Progress.cpp
index 27774ce7a5521b..c9a556472c06b6 100644
--- a/lldb/source/Core/Progress.cpp
+++ b/lldb/source/Core/Progress.cpp
@@ -151,10 +151,11 @@ void ProgressManager::Decrement(const Progress::ProgressData &progress_data) {
   std::lock_guard<std::mutex> lock(m_entries_mutex);
   llvm::StringRef key = progress_data.title;
 
-  if (!m_entries.contains(key))
+  auto it = m_entries.find(key);
+  if (it == m_entries.end())
     return;
 
-  Entry &entry = m_entries[key];
+  Entry &entry = it->second;
   entry.refcount--;
 
   if (entry.refcount == 0) {

>From cc5b5ca34b93e05199527c969a04e44f13653620 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 16 Oct 2024 07:13:37 +0100
Subject: [PATCH 06/82] [LV] Add test where interleave group start pointer is
 incorrect.

Test case from https://github.com/llvm/llvm-project/pull/106431.
---
 ...aved-accesses-different-insert-position.ll | 88 +++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
index 953c93756fef45..665fd1b9aeacaa 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
@@ -154,6 +154,92 @@ loop.latch:
 exit:
   ret void
 }
+
+; FIXME: Currently the start address of the interleave group is computed
+; incorrectly.
+define i64 @interleave_group_load_pointer_type(ptr %start, ptr %end) {
+; CHECK-LABEL: define i64 @interleave_group_load_pointer_type(
+; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; CHECK-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[END1]], [[START2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = udiv i64 [[TMP0]], 24
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[N_VEC]], 24
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 24
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x ptr>, ptr [[TMP8]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x ptr> [[WIDE_VEC]], <12 x ptr> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x ptr> [[WIDE_VEC]], <12 x ptr> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint <4 x ptr> [[STRIDED_VEC3]] to <4 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint <4 x ptr> [[STRIDED_VEC]] to <4 x i64>
+; CHECK-NEXT:    [[TMP11:%.*]] = or <4 x i64> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12]] = or <4 x i64> [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP12]])
+; CHECK-NEXT:    br label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_16:%.*]] = getelementptr i8, ptr [[PTR_IV]], i64 16
+; CHECK-NEXT:    [[L_16:%.*]] = load ptr, ptr [[GEP_16]], align 8
+; CHECK-NEXT:    [[P_16:%.*]] = ptrtoint ptr [[L_16]] to i64
+; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr i8, ptr [[PTR_IV]], i64 8
+; CHECK-NEXT:    [[L_8:%.*]] = load ptr, ptr [[GEP_8]], align 8
+; CHECK-NEXT:    [[P_8:%.*]] = ptrtoint ptr [[L_8]] to i64
+; CHECK-NEXT:    [[OR_1:%.*]] = or i64 [[P_16]], [[P_8]]
+; CHECK-NEXT:    [[RED_NEXT]] = or i64 [[OR_1]], [[RED]]
+; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr nusw i8, ptr [[PTR_IV]], i64 24
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i64 [[RED_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop ]
+  %red = phi i64 [ 0, %entry ], [ %red.next, %loop ]
+  %gep.16 = getelementptr i8, ptr %ptr.iv, i64 16
+  %l.16 = load ptr, ptr %gep.16, align 8
+  %p.16 = ptrtoint ptr %l.16 to i64
+  %gep.8 = getelementptr i8, ptr %ptr.iv, i64 8
+  %l.8 = load ptr, ptr %gep.8, align 8
+  %p.8 = ptrtoint ptr %l.8 to i64
+  %or.1 = or i64 %p.16, %p.8
+  %red.next = or i64 %or.1, %red
+  %ptr.iv.next = getelementptr nusw i8, ptr %ptr.iv, i64 24
+  %ec = icmp eq ptr %ptr.iv, %end
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i64 %red.next
+}
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -161,4 +247,6 @@ exit:
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
 ;.

>From 49de1541655cc71cfedbee10d6b4a4c12fc7e13b Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman at google.com>
Date: Tue, 15 Oct 2024 23:15:05 -0700
Subject: [PATCH 07/82] [CMake] Do not set CMP0114 explicitly to old (#90384)

CMP0114 was originally set to OLD to get rid of warnings. However, the
policy now defaults to NEW with the minimum CMake version that LLVM
requires, so it no longer produces any warnings, and setting it
explicitly to OLD does produce a warning in newer CMake versions. For
these reasons, remove the setting for now.

This is split off from the removal of the CMP0116 check, just in case
something breaks.

Partially fixes #83727.
---
 cmake/Modules/CMakePolicy.cmake | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/cmake/Modules/CMakePolicy.cmake b/cmake/Modules/CMakePolicy.cmake
index 665af01d43bd24..f19dfd71657171 100644
--- a/cmake/Modules/CMakePolicy.cmake
+++ b/cmake/Modules/CMakePolicy.cmake
@@ -1,10 +1,5 @@
 # CMake policy settings shared between LLVM projects
 
-# CMP0114: ExternalProject step targets fully adopt their steps.
-# New in CMake 3.19: https://cmake.org/cmake/help/latest/policy/CMP0114.html
-if(POLICY CMP0114)
-  cmake_policy(SET CMP0114 OLD)
-endif()
 # CMP0116: Ninja generators transform `DEPFILE`s from `add_custom_command()`
 # New in CMake 3.20. https://cmake.org/cmake/help/latest/policy/CMP0116.html
 if(POLICY CMP0116)

>From 484c02780bad58aa99baf3621530d334c6c0d59b Mon Sep 17 00:00:00 2001
From: Sirui Mu <msrlancern at gmail.com>
Date: Wed, 16 Oct 2024 14:14:06 +0800
Subject: [PATCH 08/82] Revert "[mlir][LLVMIR] Add operand bundle support for
 llvm.intr.assume (#112143)"

This reverts commit d8fadad07c952c4aea967aefb0900e4e43ad0555.

The commit breaks the following CI builds:
- ppc64le-mlir-rhel-clang: https://lab.llvm.org/buildbot/#/builders/129/builds/7685
- ppc64le-flang-rhel-clang: https://lab.llvm.org/buildbot/#/builders/157/builds/10338
---
 .../Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td   |  1 -
 .../mlir/Dialect/LLVMIR/LLVMDialect.td        |  2 -
 .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td   | 44 ++-------
 .../include/mlir/Dialect/LLVMIR/LLVMOpBase.td | 25 ++---
 mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td   | 18 +++-
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td  |  2 +-
 .../include/mlir/Target/LLVMIR/ModuleImport.h |  2 -
 mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp    | 96 +++++++------------
 .../LLVMIR/LLVMIRToLLVMTranslation.cpp        |  6 --
 .../LLVMIR/LLVMToLLVMIRTranslation.cpp        | 16 +---
 .../Dialect/NVVM/LLVMIRToNVVMTranslation.cpp  |  6 --
 mlir/lib/Target/LLVMIR/ModuleImport.cpp       | 32 +------
 mlir/lib/Target/LLVMIR/ModuleTranslation.cpp  | 37 +------
 .../expand-then-convert-to-llvm.mlir          |  2 +-
 .../MemRefToLLVM/memref-to-llvm.mlir          |  4 +-
 mlir/test/Dialect/LLVMIR/inlining.mlir        |  4 +-
 mlir/test/Dialect/LLVMIR/roundtrip.mlir       | 27 ------
 mlir/test/Target/LLVMIR/Import/intrinsic.ll   | 12 +--
 .../test/Target/LLVMIR/llvmir-intrinsics.mlir | 15 ---
 mlir/test/Target/LLVMIR/llvmir-invalid.mlir   |  2 +-
 20 files changed, 77 insertions(+), 276 deletions(-)

diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
index e81db32bcaad03..0e38325f9891ac 100644
--- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
@@ -71,7 +71,6 @@ class ArmSME_IntrOp<string mnemonic,
           /*bit requiresAccessGroup=*/0,
           /*bit requiresAliasAnalysis=*/0,
           /*bit requiresFastmath=*/0,
-          /*bit requiresOpBundles=*/0,
           /*list<int> immArgPositions=*/immArgPositions,
           /*list<string> immArgAttrNames=*/immArgAttrNames>;
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
index ea82f7f7b8e124..27a2b418aadb2a 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
@@ -59,8 +59,6 @@ def LLVM_Dialect : Dialect {
     static StringRef getStructRetAttrName() { return "llvm.sret"; }
     static StringRef getWriteOnlyAttrName() { return "llvm.writeonly"; }
     static StringRef getZExtAttrName() { return "llvm.zeroext"; }
-    static StringRef getOpBundleSizesAttrName() { return "op_bundle_sizes"; }
-    static StringRef getOpBundleTagsAttrName() { return "op_bundle_tags"; }
     // TODO Restrict the usage of this to parameter attributes once there is an
     // alternative way of modeling memory effects on FunctionOpInterface.
     /// Name of the attribute that will cause the creation of a readnone memory
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
index 845c88b1be7750..ab40c8ec4b6588 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
@@ -120,8 +120,7 @@ def LLVM_Log2Op : LLVM_UnaryIntrOpF<"log2">;
 def LLVM_LogOp : LLVM_UnaryIntrOpF<"log">;
 def LLVM_Prefetch : LLVM_ZeroResultIntrOp<"prefetch", [0],
   /*traits=*/[], /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
-  /*requiresOpBundles=*/0, /*immArgPositions=*/[1, 2, 3],
-  /*immArgAttrNames=*/["rw", "hint", "cache"]
+  /*immArgPositions=*/[1, 2, 3], /*immArgAttrNames=*/["rw", "hint", "cache"]
 > {
   let arguments = (ins LLVM_AnyPointer:$addr, I32Attr:$rw, I32Attr:$hint, I32Attr:$cache);
 }
@@ -177,8 +176,7 @@ class LLVM_MemcpyIntrOpBase<string name> :
      DeclareOpInterfaceMethods<DestructurableAccessorOpInterface>,
      DeclareOpInterfaceMethods<SafeMemorySlotAccessOpInterface>],
     /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1,
-    /*requiresOpBundles=*/0, /*immArgPositions=*/[3],
-    /*immArgAttrNames=*/["isVolatile"]> {
+    /*immArgPositions=*/[3], /*immArgAttrNames=*/["isVolatile"]> {
   dag args = (ins Arg<LLVM_AnyPointer,"",[MemWrite]>:$dst,
                   Arg<LLVM_AnyPointer,"",[MemRead]>:$src,
                   AnySignlessInteger:$len, I1Attr:$isVolatile);
@@ -208,8 +206,7 @@ def LLVM_MemcpyInlineOp :
      DeclareOpInterfaceMethods<DestructurableAccessorOpInterface>,
      DeclareOpInterfaceMethods<SafeMemorySlotAccessOpInterface>],
     /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1,
-    /*requiresOpBundles=*/0, /*immArgPositions=*/[2, 3],
-    /*immArgAttrNames=*/["len", "isVolatile"]> {
+    /*immArgPositions=*/[2, 3], /*immArgAttrNames=*/["len", "isVolatile"]> {
   dag args = (ins Arg<LLVM_AnyPointer,"",[MemWrite]>:$dst,
                   Arg<LLVM_AnyPointer,"",[MemRead]>:$src,
                   APIntAttr:$len, I1Attr:$isVolatile);
@@ -235,8 +232,7 @@ def LLVM_MemsetOp : LLVM_ZeroResultIntrOp<"memset", [0, 2],
      DeclareOpInterfaceMethods<DestructurableAccessorOpInterface>,
      DeclareOpInterfaceMethods<SafeMemorySlotAccessOpInterface>],
     /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1,
-    /*requiresOpBundles=*/0, /*immArgPositions=*/[3],
-    /*immArgAttrNames=*/["isVolatile"]> {
+    /*immArgPositions=*/[3], /*immArgAttrNames=*/["isVolatile"]> {
   dag args = (ins Arg<LLVM_AnyPointer,"",[MemWrite]>:$dst,
                   I8:$val, AnySignlessInteger:$len, I1Attr:$isVolatile);
   // Append the alias attributes defined by LLVM_IntrOpBase.
@@ -290,8 +286,7 @@ def LLVM_NoAliasScopeDeclOp
 class LLVM_LifetimeBaseOp<string opName> : LLVM_ZeroResultIntrOp<opName, [1],
     [DeclareOpInterfaceMethods<PromotableOpInterface>],
     /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
-    /*requiresOpBundles=*/0, /*immArgPositions=*/[0],
-    /*immArgAttrNames=*/["size"]> {
+    /*immArgPositions=*/[0], /*immArgAttrNames=*/["size"]> {
   let arguments = (ins I64Attr:$size, LLVM_AnyPointer:$ptr);
   let assemblyFormat = "$size `,` $ptr attr-dict `:` qualified(type($ptr))";
 }
@@ -311,8 +306,7 @@ def LLVM_InvariantStartOp : LLVM_OneResultIntrOp<"invariant.start", [], [1],
 def LLVM_InvariantEndOp : LLVM_ZeroResultIntrOp<"invariant.end", [2],
     [DeclareOpInterfaceMethods<PromotableOpInterface>],
     /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
-    /*requiresOpBundles=*/0, /*immArgPositions=*/[1],
-    /*immArgAttrNames=*/["size"]> {
+    /*immArgPositions=*/[1], /*immArgAttrNames=*/["size"]> {
   let arguments = (ins LLVM_DefaultPointer:$start,
                        I64Attr:$size,
                        LLVM_AnyPointer:$ptr);
@@ -374,7 +368,7 @@ class LLVM_ConstrainedIntr<string mnem, int numArgs,
     SmallVector<Value> mlirOperands;
     SmallVector<NamedAttribute> mlirAttrs;
     if (failed(moduleImport.convertIntrinsicArguments(
-        llvmOperands.take_front( }] # numArgs # [{), {}, false,
+        llvmOperands.take_front( }] # numArgs # [{),
         {}, {}, mlirOperands, mlirAttrs))) {
       return failure();
     }
@@ -435,26 +429,7 @@ def LLVM_USHLSat : LLVM_BinarySameArgsIntrOpI<"ushl.sat">;
 //
 
 def LLVM_AssumeOp
-    : LLVM_ZeroResultIntrOp<"assume", /*overloadedOperands=*/[], /*traits=*/[],
-                            /*requiresAccessGroup=*/0,
-                            /*requiresAliasAnalysis=*/0,
-                            /*requiresOpBundles=*/1> {
-  dag args = (ins I1:$cond);
-  let arguments = !con(args, opBundleArgs);
-
-  let assemblyFormat = [{
-    $cond
-    ( custom<OpBundles>($op_bundle_operands, type($op_bundle_operands),
-                        $op_bundle_tags)^ )?
-    `:` type($cond) attr-dict
-  }];
-
-  let builders = [
-    OpBuilder<(ins "Value":$cond)>
-  ];
-
-  let hasVerifier = 1;
-}
+  : LLVM_ZeroResultIntrOp<"assume", []>, Arguments<(ins I1:$cond)>;
 
 def LLVM_SSACopyOp : LLVM_OneResultIntrOp<"ssa.copy", [], [0],
                                             [Pure, SameOperandsAndResultType]> {
@@ -1017,8 +992,7 @@ def LLVM_DebugTrap : LLVM_ZeroResultIntrOp<"debugtrap">;
 def LLVM_UBSanTrap : LLVM_ZeroResultIntrOp<"ubsantrap",
   /*overloadedOperands=*/[], /*traits=*/[],
   /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
-  /*requiresOpBundles=*/0, /*immArgPositions=*/[0],
-  /*immArgAttrNames=*/["failureKind"]> {
+  /*immArgPositions=*/[0], /*immArgAttrNames=*/["failureKind"]> {
   let arguments = (ins I8Attr:$failureKind);
 }
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
index a38dafa4d9cf34..c3d352d8d0dd48 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
@@ -291,7 +291,7 @@ class LLVM_IntrOpBase<Dialect dialect, string opName, string enumName,
                       list<int> overloadedResults, list<int> overloadedOperands,
                       list<Trait> traits, int numResults,
                       bit requiresAccessGroup = 0, bit requiresAliasAnalysis = 0,
-                      bit requiresFastmath = 0, bit requiresOpBundles = 0,
+                      bit requiresFastmath = 0,
                       list<int> immArgPositions = [],
                       list<string> immArgAttrNames = []>
     : LLVM_OpBase<dialect, opName, !listconcat(
@@ -313,12 +313,6 @@ class LLVM_IntrOpBase<Dialect dialect, string opName, string enumName,
                  OptionalAttr<LLVM_AliasScopeArrayAttr>:$noalias_scopes,
                  OptionalAttr<LLVM_TBAATagArrayAttr>:$tbaa),
             (ins )));
-  dag opBundleArgs = !if(!gt(requiresOpBundles, 0),
-                         (ins VariadicOfVariadic<LLVM_Type,
-                                "op_bundle_sizes">:$op_bundle_operands,
-                              DenseI32ArrayAttr:$op_bundle_sizes,
-                              OptionalAttr<ArrayAttr>:$op_bundle_tags),
-                         (ins ));
   string llvmEnumName = enumName;
   string overloadedResultsCpp =  "{" # !interleave(overloadedResults, ", ") # "}";
   string overloadedOperandsCpp =  "{" # !interleave(overloadedOperands, ", ") # "}";
@@ -342,8 +336,6 @@ class LLVM_IntrOpBase<Dialect dialect, string opName, string enumName,
     SmallVector<NamedAttribute> mlirAttrs;
     if (failed(moduleImport.convertIntrinsicArguments(
       llvmOperands,
-      llvmOpBundles,
-      }] # !if(!gt(requiresOpBundles, 0), "true", "false") # [{,
       }] # immArgPositionsCpp # [{,
       }] # immArgAttrNamesCpp # [{,
       mlirOperands,
@@ -389,14 +381,12 @@ class LLVM_IntrOp<string mnem, list<int> overloadedResults,
                   list<int> overloadedOperands, list<Trait> traits,
                   int numResults, bit requiresAccessGroup = 0,
                   bit requiresAliasAnalysis = 0, bit requiresFastmath = 0,
-                  bit requiresOpBundles = 0,
                   list<int> immArgPositions = [],
                   list<string> immArgAttrNames = []>
     : LLVM_IntrOpBase<LLVM_Dialect, "intr." # mnem, !subst(".", "_", mnem),
                       overloadedResults, overloadedOperands, traits,
                       numResults, requiresAccessGroup, requiresAliasAnalysis,
-                      requiresFastmath, requiresOpBundles, immArgPositions,
-                      immArgAttrNames>;
+                      requiresFastmath, immArgPositions, immArgAttrNames>;
 
 // Base class for LLVM intrinsic operations returning no results. Places the
 // intrinsic into the LLVM dialect and prefixes its name with "intr.".
@@ -416,13 +406,11 @@ class LLVM_ZeroResultIntrOp<string mnem, list<int> overloadedOperands = [],
                             list<Trait> traits = [],
                             bit requiresAccessGroup = 0,
                             bit requiresAliasAnalysis = 0,
-                            bit requiresOpBundles = 0,
                             list<int> immArgPositions = [],
                             list<string> immArgAttrNames = []>
     : LLVM_IntrOp<mnem, [], overloadedOperands, traits, /*numResults=*/0,
                   requiresAccessGroup, requiresAliasAnalysis,
-                  /*requiresFastMath=*/0, requiresOpBundles, immArgPositions,
-                  immArgAttrNames>;
+                  /*requiresFastMath=*/0, immArgPositions, immArgAttrNames>;
 
 // Base class for LLVM intrinsic operations returning one result. Places the
 // intrinsic into the LLVM dialect and prefixes its name with "intr.". This is
@@ -434,12 +422,11 @@ class LLVM_OneResultIntrOp<string mnem, list<int> overloadedResults = [],
                            list<int> overloadedOperands = [],
                            list<Trait> traits = [],
                            bit requiresFastmath = 0,
-                           list<int> immArgPositions = [],
-                           list<string> immArgAttrNames = []>
+                          list<int> immArgPositions = [],
+                          list<string> immArgAttrNames = []>
     : LLVM_IntrOp<mnem, overloadedResults, overloadedOperands, traits, 1,
                   /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
-                  requiresFastmath, /*requiresOpBundles=*/0, immArgPositions,
-                  immArgAttrNames>;
+                  requiresFastmath, immArgPositions, immArgAttrNames>;
 
 def LLVM_OneResultOpBuilder :
   OpBuilder<(ins "Type":$resultType, "ValueRange":$operands,
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index d5def510a904d3..bbca7bc7286acb 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -559,7 +559,11 @@ def LLVM_InvokeOp : LLVM_Op<"invoke", [
                    VariadicOfVariadic<LLVM_Type,
                                       "op_bundle_sizes">:$op_bundle_operands,
                    DenseI32ArrayAttr:$op_bundle_sizes,
-                   OptionalAttr<ArrayAttr>:$op_bundle_tags);
+                   DefaultValuedProperty<
+                     ArrayProperty<StringProperty, "operand bundle tags">,
+                     "ArrayRef<std::string>{}",
+                     "SmallVector<std::string>{}"
+                   >:$op_bundle_tags);
   let results = (outs Optional<LLVM_Type>:$result);
   let successors = (successor AnySuccessor:$normalDest,
                               AnySuccessor:$unwindDest);
@@ -674,7 +678,11 @@ def LLVM_CallOp : LLVM_MemAccessOpBase<"call",
                   VariadicOfVariadic<LLVM_Type,
                                      "op_bundle_sizes">:$op_bundle_operands,
                   DenseI32ArrayAttr:$op_bundle_sizes,
-                  OptionalAttr<ArrayAttr>:$op_bundle_tags);
+                  DefaultValuedProperty<
+                    ArrayProperty<StringProperty, "operand bundle tags">,
+                    "ArrayRef<std::string>{}",
+                    "SmallVector<std::string>{}"
+                  >:$op_bundle_tags);
   // Append the aliasing related attributes defined in LLVM_MemAccessOpBase.
   let arguments = !con(args, aliasAttrs);
   let results = (outs Optional<LLVM_Type>:$result);
@@ -1922,7 +1930,11 @@ def LLVM_CallIntrinsicOp
                        VariadicOfVariadic<LLVM_Type,
                                           "op_bundle_sizes">:$op_bundle_operands,
                        DenseI32ArrayAttr:$op_bundle_sizes,
-                       OptionalAttr<ArrayAttr>:$op_bundle_tags);
+                       DefaultValuedProperty<
+                         ArrayProperty<StringProperty, "operand bundle tags">,
+                         "ArrayRef<std::string>{}",
+                         "SmallVector<std::string>{}"
+                       >:$op_bundle_tags);
   let results = (outs Optional<LLVM_Type>:$results);
   let llvmBuilder = [{
     return convertCallLLVMIntrinsicOp(op, builder, moduleTranslation);
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 3695708439d91f..c40ae4b1016b49 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -98,7 +98,7 @@ class ROCDL_IntrOp<string mnemonic, list<int> overloadedResults,
   LLVM_IntrOpBase<ROCDL_Dialect,  mnemonic,
     "amdgcn_" # !subst(".", "_", mnemonic), overloadedResults,
     overloadedOperands, traits, numResults, requiresAccessGroup,
-    requiresAliasAnalysis, 0, 0, immArgPositions, immArgAttrNames>;
+    requiresAliasAnalysis, 0, immArgPositions, immArgAttrNames>;
 
 //===----------------------------------------------------------------------===//
 // ROCDL special register op definitions
diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
index bbb7af58d27393..9f300bcafea537 100644
--- a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
+++ b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
@@ -243,8 +243,6 @@ class ModuleImport {
   /// corresponding MLIR attribute names.
   LogicalResult
   convertIntrinsicArguments(ArrayRef<llvm::Value *> values,
-                            ArrayRef<llvm::OperandBundleUse> opBundles,
-                            bool requiresOpBundles,
                             ArrayRef<unsigned> immArgPositions,
                             ArrayRef<StringLiteral> immArgAttrNames,
                             SmallVectorImpl<Value> &valuesOut,
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index cc73878a64ff67..12ed8cc88ae7b7 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -241,18 +241,13 @@ static void printOneOpBundle(OpAsmPrinter &p, OperandRange operands,
 static void printOpBundles(OpAsmPrinter &p, Operation *op,
                            OperandRangeRange opBundleOperands,
                            TypeRangeRange opBundleOperandTypes,
-                           std::optional<ArrayAttr> opBundleTags) {
-  if (opBundleOperands.empty())
-    return;
-  assert(opBundleTags && "expect operand bundle tags");
-
+                           ArrayRef<std::string> opBundleTags) {
   p << "[";
   llvm::interleaveComma(
-      llvm::zip(opBundleOperands, opBundleOperandTypes, *opBundleTags), p,
+      llvm::zip(opBundleOperands, opBundleOperandTypes, opBundleTags), p,
       [&p](auto bundle) {
-        auto bundleTag = cast<StringAttr>(std::get<2>(bundle)).getValue();
         printOneOpBundle(p, std::get<0>(bundle), std::get<1>(bundle),
-                         bundleTag);
+                         std::get<2>(bundle));
       });
   p << "]";
 }
@@ -261,7 +256,7 @@ static ParseResult parseOneOpBundle(
     OpAsmParser &p,
     SmallVector<SmallVector<OpAsmParser::UnresolvedOperand>> &opBundleOperands,
     SmallVector<SmallVector<Type>> &opBundleOperandTypes,
-    SmallVector<Attribute> &opBundleTags) {
+    SmallVector<std::string> &opBundleTags) {
   SMLoc currentParserLoc = p.getCurrentLocation();
   SmallVector<OpAsmParser::UnresolvedOperand> operands;
   SmallVector<Type> types;
@@ -281,7 +276,7 @@ static ParseResult parseOneOpBundle(
 
   opBundleOperands.push_back(std::move(operands));
   opBundleOperandTypes.push_back(std::move(types));
-  opBundleTags.push_back(StringAttr::get(p.getContext(), tag));
+  opBundleTags.push_back(std::move(tag));
 
   return success();
 }
@@ -290,17 +285,16 @@ static std::optional<ParseResult> parseOpBundles(
     OpAsmParser &p,
     SmallVector<SmallVector<OpAsmParser::UnresolvedOperand>> &opBundleOperands,
     SmallVector<SmallVector<Type>> &opBundleOperandTypes,
-    ArrayAttr &opBundleTags) {
+    SmallVector<std::string> &opBundleTags) {
   if (p.parseOptionalLSquare())
     return std::nullopt;
 
   if (succeeded(p.parseOptionalRSquare()))
     return success();
 
-  SmallVector<Attribute> opBundleTagAttrs;
   auto bundleParser = [&] {
     return parseOneOpBundle(p, opBundleOperands, opBundleOperandTypes,
-                            opBundleTagAttrs);
+                            opBundleTags);
   };
   if (p.parseCommaSeparatedList(bundleParser))
     return failure();
@@ -308,8 +302,6 @@ static std::optional<ParseResult> parseOpBundles(
   if (p.parseRSquare())
     return failure();
 
-  opBundleTags = ArrayAttr::get(p.getContext(), opBundleTagAttrs);
-
   return success();
 }
 
@@ -1047,7 +1039,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results,
         /*CConv=*/nullptr, /*TailCallKind=*/nullptr,
         /*memory_effects=*/nullptr,
         /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
-        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt,
         /*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
         /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -1074,7 +1066,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state,
         /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
         /*convergent=*/nullptr,
         /*no_unwind=*/nullptr, /*will_return=*/nullptr,
-        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt,
         /*access_groups=*/nullptr,
         /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -1087,7 +1079,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state,
         /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr,
         /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
         /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
-        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt,
         /*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
         /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -1100,7 +1092,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func,
         /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr,
         /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
         /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
-        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt,
         /*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
         /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -1200,20 +1192,12 @@ LogicalResult verifyCallOpVarCalleeType(OpTy callOp) {
 template <typename OpType>
 static LogicalResult verifyOperandBundles(OpType &op) {
   OperandRangeRange opBundleOperands = op.getOpBundleOperands();
-  std::optional<ArrayAttr> opBundleTags = op.getOpBundleTags();
+  ArrayRef<std::string> opBundleTags = op.getOpBundleTags();
 
-  auto isStringAttr = [](Attribute tagAttr) {
-    return isa<StringAttr>(tagAttr);
-  };
-  if (opBundleTags && !llvm::all_of(*opBundleTags, isStringAttr))
-    return op.emitError("operand bundle tag must be a StringAttr");
-
-  size_t numOpBundles = opBundleOperands.size();
-  size_t numOpBundleTags = opBundleTags ? opBundleTags->size() : 0;
-  if (numOpBundles != numOpBundleTags)
+  if (opBundleTags.size() != opBundleOperands.size())
     return op.emitError("expected ")
-           << numOpBundles << " operand bundle tags, but actually got "
-           << numOpBundleTags;
+           << opBundleOperands.size()
+           << " operand bundle tags, but actually got " << opBundleTags.size();
 
   return success();
 }
@@ -1345,8 +1329,7 @@ void CallOp::print(OpAsmPrinter &p) {
                           {getCalleeAttrName(), getTailCallKindAttrName(),
                            getVarCalleeTypeAttrName(), getCConvAttrName(),
                            getOperandSegmentSizesAttrName(),
-                           getOpBundleSizesAttrName(),
-                           getOpBundleTagsAttrName()});
+                           getOpBundleSizesAttrName()});
 
   p << " : ";
   if (!isDirect)
@@ -1454,7 +1437,7 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) {
   SmallVector<OpAsmParser::UnresolvedOperand> operands;
   SmallVector<SmallVector<OpAsmParser::UnresolvedOperand>> opBundleOperands;
   SmallVector<SmallVector<Type>> opBundleOperandTypes;
-  ArrayAttr opBundleTags;
+  SmallVector<std::string> opBundleTags;
 
   // Default to C Calling Convention if no keyword is provided.
   result.addAttribute(
@@ -1500,9 +1483,9 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) {
           parser, opBundleOperands, opBundleOperandTypes, opBundleTags);
       result && failed(*result))
     return failure();
-  if (opBundleTags && !opBundleTags.empty())
-    result.addAttribute(CallOp::getOpBundleTagsAttrName(result.name).getValue(),
-                        opBundleTags);
+  if (!opBundleTags.empty())
+    result.getOrAddProperties<CallOp::Properties>().op_bundle_tags =
+        std::move(opBundleTags);
 
   if (parser.parseOptionalAttrDict(result.attributes))
     return failure();
@@ -1542,7 +1525,8 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func,
   auto calleeType = func.getFunctionType();
   build(builder, state, getCallOpResultTypes(calleeType),
         getCallOpVarCalleeType(calleeType), SymbolRefAttr::get(func), ops,
-        normalOps, unwindOps, nullptr, nullptr, {}, {}, normal, unwind);
+        normalOps, unwindOps, nullptr, nullptr, {}, std::nullopt, normal,
+        unwind);
 }
 
 void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys,
@@ -1551,7 +1535,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys,
                      ValueRange unwindOps) {
   build(builder, state, tys,
         /*var_callee_type=*/nullptr, callee, ops, normalOps, unwindOps, nullptr,
-        nullptr, {}, {}, normal, unwind);
+        nullptr, {}, std::nullopt, normal, unwind);
 }
 
 void InvokeOp::build(OpBuilder &builder, OperationState &state,
@@ -1560,7 +1544,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state,
                      Block *unwind, ValueRange unwindOps) {
   build(builder, state, getCallOpResultTypes(calleeType),
         getCallOpVarCalleeType(calleeType), callee, ops, normalOps, unwindOps,
-        nullptr, nullptr, {}, {}, normal, unwind);
+        nullptr, nullptr, {}, std::nullopt, normal, unwind);
 }
 
 SuccessorOperands InvokeOp::getSuccessorOperands(unsigned index) {
@@ -1650,8 +1634,7 @@ void InvokeOp::print(OpAsmPrinter &p) {
   p.printOptionalAttrDict((*this)->getAttrs(),
                           {getCalleeAttrName(), getOperandSegmentSizeAttr(),
                            getCConvAttrName(), getVarCalleeTypeAttrName(),
-                           getOpBundleSizesAttrName(),
-                           getOpBundleTagsAttrName()});
+                           getOpBundleSizesAttrName()});
 
   p << " : ";
   if (!isDirect)
@@ -1674,7 +1657,7 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) {
   TypeAttr varCalleeType;
   SmallVector<SmallVector<OpAsmParser::UnresolvedOperand>> opBundleOperands;
   SmallVector<SmallVector<Type>> opBundleOperandTypes;
-  ArrayAttr opBundleTags;
+  SmallVector<std::string> opBundleTags;
   Block *normalDest, *unwindDest;
   SmallVector<Value, 4> normalOperands, unwindOperands;
   Builder &builder = parser.getBuilder();
@@ -1720,10 +1703,9 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) {
           parser, opBundleOperands, opBundleOperandTypes, opBundleTags);
       result && failed(*result))
     return failure();
-  if (opBundleTags && !opBundleTags.empty())
-    result.addAttribute(
-        InvokeOp::getOpBundleTagsAttrName(result.name).getValue(),
-        opBundleTags);
+  if (!opBundleTags.empty())
+    result.getOrAddProperties<InvokeOp::Properties>().op_bundle_tags =
+        std::move(opBundleTags);
 
   if (parser.parseOptionalAttrDict(result.attributes))
     return failure();
@@ -3351,7 +3333,7 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
                             mlir::StringAttr intrin, mlir::ValueRange args) {
   build(builder, state, /*resultTypes=*/TypeRange{}, intrin, args,
         FastmathFlagsAttr{},
-        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{});
+        /*op_bundle_operands=*/{});
 }
 
 void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
@@ -3359,14 +3341,14 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
                             mlir::LLVM::FastmathFlagsAttr fastMathFlags) {
   build(builder, state, /*resultTypes=*/TypeRange{}, intrin, args,
         fastMathFlags,
-        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{});
+        /*op_bundle_operands=*/{});
 }
 
 void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
                             mlir::Type resultType, mlir::StringAttr intrin,
                             mlir::ValueRange args) {
   build(builder, state, {resultType}, intrin, args, FastmathFlagsAttr{},
-        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{});
+        /*op_bundle_operands=*/{});
 }
 
 void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
@@ -3374,7 +3356,7 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
                             mlir::StringAttr intrin, mlir::ValueRange args,
                             mlir::LLVM::FastmathFlagsAttr fastMathFlags) {
   build(builder, state, resultTypes, intrin, args, fastMathFlags,
-        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{});
+        /*op_bundle_operands=*/{});
 }
 
 //===----------------------------------------------------------------------===//
@@ -3431,18 +3413,6 @@ void InlineAsmOp::getEffects(
   }
 }
 
-//===----------------------------------------------------------------------===//
-// AssumeOp (intrinsic)
-//===----------------------------------------------------------------------===//
-
-void LLVM::AssumeOp::build(OpBuilder &builder, OperationState &state,
-                           mlir::Value cond) {
-  return build(builder, state, cond, /*op_bundle_operands=*/{},
-               /*op_bundle_tags=*/{});
-}
-
-LogicalResult LLVM::AssumeOp::verify() { return verifyOperandBundles(*this); }
-
 //===----------------------------------------------------------------------===//
 // masked_gather (intrinsic)
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
index 4fd043c7c93e68..d034e576dfc579 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
@@ -68,12 +68,6 @@ static LogicalResult convertIntrinsicImpl(OpBuilder &odsBuilder,
   if (isConvertibleIntrinsic(intrinsicID)) {
     SmallVector<llvm::Value *> args(inst->args());
     ArrayRef<llvm::Value *> llvmOperands(args);
-
-    SmallVector<llvm::OperandBundleUse> llvmOpBundles;
-    llvmOpBundles.reserve(inst->getNumOperandBundles());
-    for (unsigned i = 0; i < inst->getNumOperandBundles(); ++i)
-      llvmOpBundles.push_back(inst->getOperandBundleAt(i));
-
 #include "mlir/Dialect/LLVMIR/LLVMIntrinsicFromLLVMIRConversions.inc"
   }
 
diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
index 2084e527773ca8..a8595d14ccf2e5 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
@@ -114,27 +114,17 @@ convertOperandBundle(OperandRange bundleOperands, StringRef bundleTag,
 }
 
 static SmallVector<llvm::OperandBundleDef>
-convertOperandBundles(OperandRangeRange bundleOperands, ArrayAttr bundleTags,
+convertOperandBundles(OperandRangeRange bundleOperands,
+                      ArrayRef<std::string> bundleTags,
                       LLVM::ModuleTranslation &moduleTranslation) {
   SmallVector<llvm::OperandBundleDef> bundles;
   bundles.reserve(bundleOperands.size());
 
-  for (auto [operands, tagAttr] : llvm::zip_equal(bundleOperands, bundleTags)) {
-    StringRef tag = cast<StringAttr>(tagAttr).getValue();
+  for (auto [operands, tag] : llvm::zip_equal(bundleOperands, bundleTags))
     bundles.push_back(convertOperandBundle(operands, tag, moduleTranslation));
-  }
   return bundles;
 }
 
-static SmallVector<llvm::OperandBundleDef>
-convertOperandBundles(OperandRangeRange bundleOperands,
-                      std::optional<ArrayAttr> bundleTags,
-                      LLVM::ModuleTranslation &moduleTranslation) {
-  if (!bundleTags)
-    return {};
-  return convertOperandBundles(bundleOperands, *bundleTags, moduleTranslation);
-}
-
 /// Builder for LLVM_CallIntrinsicOp
 static LogicalResult
 convertCallLLVMIntrinsicOp(CallIntrinsicOp op, llvm::IRBuilderBase &builder,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp
index 2c0b665ad0d833..bc830a77f3c580 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp
@@ -50,12 +50,6 @@ static LogicalResult convertIntrinsicImpl(OpBuilder &odsBuilder,
   if (isConvertibleIntrinsic(intrinsicID)) {
     SmallVector<llvm::Value *> args(inst->args());
     ArrayRef<llvm::Value *> llvmOperands(args);
-
-    SmallVector<llvm::OperandBundleUse> llvmOpBundles;
-    llvmOpBundles.reserve(inst->getNumOperandBundles());
-    for (unsigned i = 0; i < inst->getNumOperandBundles(); ++i)
-      llvmOpBundles.push_back(inst->getOperandBundleAt(i));
-
 #include "mlir/Dialect/LLVMIR/NVVMFromLLVMIRConversions.inc"
   }
 
diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
index 6e97b2a50af8a1..bd861f3a69e53c 100644
--- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
@@ -1311,8 +1311,7 @@ ModuleImport::convertValues(ArrayRef<llvm::Value *> values) {
 }
 
 LogicalResult ModuleImport::convertIntrinsicArguments(
-    ArrayRef<llvm::Value *> values, ArrayRef<llvm::OperandBundleUse> opBundles,
-    bool requiresOpBundles, ArrayRef<unsigned> immArgPositions,
+    ArrayRef<llvm::Value *> values, ArrayRef<unsigned> immArgPositions,
     ArrayRef<StringLiteral> immArgAttrNames, SmallVectorImpl<Value> &valuesOut,
     SmallVectorImpl<NamedAttribute> &attrsOut) {
   assert(immArgPositions.size() == immArgAttrNames.size() &&
@@ -1342,35 +1341,6 @@ LogicalResult ModuleImport::convertIntrinsicArguments(
     valuesOut.push_back(*mlirValue);
   }
 
-  SmallVector<int> opBundleSizes;
-  SmallVector<Attribute> opBundleTagAttrs;
-  if (requiresOpBundles) {
-    opBundleSizes.reserve(opBundles.size());
-    opBundleTagAttrs.reserve(opBundles.size());
-
-    for (const llvm::OperandBundleUse &bundle : opBundles) {
-      opBundleSizes.push_back(bundle.Inputs.size());
-      opBundleTagAttrs.push_back(StringAttr::get(context, bundle.getTagName()));
-
-      for (const llvm::Use &opBundleOperand : bundle.Inputs) {
-        auto operandMlirValue = convertValue(opBundleOperand.get());
-        if (failed(operandMlirValue))
-          return failure();
-        valuesOut.push_back(*operandMlirValue);
-      }
-    }
-
-    auto opBundleSizesAttr = DenseI32ArrayAttr::get(context, opBundleSizes);
-    auto opBundleSizesAttrNameAttr =
-        StringAttr::get(context, LLVMDialect::getOpBundleSizesAttrName());
-    attrsOut.push_back({opBundleSizesAttrNameAttr, opBundleSizesAttr});
-
-    auto opBundleTagsAttr = ArrayAttr::get(context, opBundleTagAttrs);
-    auto opBundleTagsAttrNameAttr =
-        StringAttr::get(context, LLVMDialect::getOpBundleTagsAttrName());
-    attrsOut.push_back({opBundleTagsAttrNameAttr, opBundleTagsAttr});
-  }
-
   return success();
 }
 
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index e4c097c0daed3e..6e005f9ec5df85 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -55,7 +55,6 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
-#include <numeric>
 #include <optional>
 
 #define DEBUG_TYPE "llvm-dialect-to-llvm-ir"
@@ -855,40 +854,8 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall(
          "LLVM `immArgPositions` and MLIR `immArgAttrNames` should have equal "
          "length");
 
-  SmallVector<llvm::OperandBundleDef> opBundles;
-  size_t numOpBundleOperands = 0;
-  auto opBundleSizesAttr = cast_if_present<DenseI32ArrayAttr>(
-      intrOp->getAttr(LLVMDialect::getOpBundleSizesAttrName()));
-  auto opBundleTagsAttr = cast_if_present<ArrayAttr>(
-      intrOp->getAttr(LLVMDialect::getOpBundleTagsAttrName()));
-
-  if (opBundleSizesAttr && opBundleTagsAttr) {
-    ArrayRef<int> opBundleSizes = opBundleSizesAttr.asArrayRef();
-    assert(opBundleSizes.size() == opBundleTagsAttr.size() &&
-           "operand bundles and tags do not match");
-
-    numOpBundleOperands =
-        std::reduce(opBundleSizes.begin(), opBundleSizes.end());
-    assert(numOpBundleOperands <= intrOp->getNumOperands() &&
-           "operand bundle operands is more than the number of operands");
-
-    ValueRange operands = intrOp->getOperands().take_back(numOpBundleOperands);
-    size_t nextOperandIdx = 0;
-    opBundles.reserve(opBundleSizesAttr.size());
-
-    for (auto [opBundleTagAttr, bundleSize] :
-         llvm::zip(opBundleTagsAttr, opBundleSizes)) {
-      auto bundleTag = cast<StringAttr>(opBundleTagAttr).str();
-      auto bundleOperands = moduleTranslation.lookupValues(
-          operands.slice(nextOperandIdx, bundleSize));
-      opBundles.emplace_back(std::move(bundleTag), std::move(bundleOperands));
-      nextOperandIdx += bundleSize;
-    }
-  }
-
   // Map operands and attributes to LLVM values.
-  auto opOperands = intrOp->getOperands().drop_back(numOpBundleOperands);
-  auto operands = moduleTranslation.lookupValues(opOperands);
+  auto operands = moduleTranslation.lookupValues(intrOp->getOperands());
   SmallVector<llvm::Value *> args(immArgPositions.size() + operands.size());
   for (auto [immArgPos, immArgName] :
        llvm::zip(immArgPositions, immArgAttrNames)) {
@@ -923,7 +890,7 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall(
   llvm::Function *llvmIntr = llvm::Intrinsic::getOrInsertDeclaration(
       module, intrinsic, overloadedTypes);
 
-  return builder.CreateCall(llvmIntr, args, opBundles);
+  return builder.CreateCall(llvmIntr, args);
 }
 
 /// Given a single MLIR operation, create the corresponding LLVM IR operation
diff --git a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir
index 55b1bc9c545a85..b86103422b0745 100644
--- a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir
+++ b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir
@@ -684,7 +684,7 @@ func.func @collapse_static_shape_with_non_identity_layout(%arg: memref<1x1x8x8xf
 // CHECK: %[[INT_TO_PTR:.*]] = llvm.ptrtoint %[[BUFF_ADDR]] : !llvm.ptr to i64
 // CHECK: %[[AND:.*]] = llvm.and %[[INT_TO_PTR]], {{.*}}  : i64
 // CHECK: %[[CMP:.*]] = llvm.icmp "eq" %[[AND]], {{.*}} : i64
-// CHECK: llvm.intr.assume %[[CMP]] : i1
+// CHECK: "llvm.intr.assume"(%[[CMP]]) : (i1) -> ()
 // CHECK: %[[LD_ADDR:.*]] = llvm.getelementptr %[[BUFF_ADDR]][%{{.*}}] : (!llvm.ptr, i64) -> !llvm.ptr, f32
 // CHECK: %[[VAL:.*]] = llvm.load %[[LD_ADDR]] : !llvm.ptr -> f32
 // CHECK: return %[[VAL]] : f32
diff --git a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
index 48dc9079333d4f..9dc22abf143bf0 100644
--- a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
+++ b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
@@ -160,7 +160,7 @@ func.func @assume_alignment(%0 : memref<4x4xf16>) {
   // CHECK-NEXT: %[[INT:.*]] = llvm.ptrtoint %[[PTR]] : !llvm.ptr to i64
   // CHECK-NEXT: %[[MASKED_PTR:.*]] = llvm.and %[[INT]], %[[MASK:.*]] : i64
   // CHECK-NEXT: %[[CONDITION:.*]] = llvm.icmp "eq" %[[MASKED_PTR]], %[[ZERO]] : i64
-  // CHECK-NEXT: llvm.intr.assume %[[CONDITION]] : i1
+  // CHECK-NEXT: "llvm.intr.assume"(%[[CONDITION]]) : (i1) -> ()
   memref.assume_alignment %0, 16 : memref<4x4xf16>
   return
 }
@@ -177,7 +177,7 @@ func.func @assume_alignment_w_offset(%0 : memref<4x4xf16, strided<[?, ?], offset
   // CHECK-NEXT: %[[INT:.*]] = llvm.ptrtoint %[[BUFF_ADDR]] : !llvm.ptr to i64
   // CHECK-NEXT: %[[MASKED_PTR:.*]] = llvm.and %[[INT]], %[[MASK:.*]] : i64
   // CHECK-NEXT: %[[CONDITION:.*]] = llvm.icmp "eq" %[[MASKED_PTR]], %[[ZERO]] : i64
-  // CHECK-NEXT: llvm.intr.assume %[[CONDITION]] : i1
+  // CHECK-NEXT: "llvm.intr.assume"(%[[CONDITION]]) : (i1) -> ()
   memref.assume_alignment %0, 16 : memref<4x4xf16, strided<[?, ?], offset: ?>>
   return
 }
diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir
index 0b7ca3f2bb048a..f9551e311df59f 100644
--- a/mlir/test/Dialect/LLVMIR/inlining.mlir
+++ b/mlir/test/Dialect/LLVMIR/inlining.mlir
@@ -18,7 +18,7 @@ func.func @inner_func_inlinable(%ptr : !llvm.ptr) -> i32 {
   "llvm.intr.memset"(%ptr, %byte, %0) <{isVolatile = true}> : (!llvm.ptr, i8, i32) -> ()
   "llvm.intr.memmove"(%ptr, %ptr, %0) <{isVolatile = true}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
   "llvm.intr.memcpy"(%ptr, %ptr, %0) <{isVolatile = true}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
-  llvm.intr.assume %true : i1
+  "llvm.intr.assume"(%true) : (i1) -> ()
   llvm.fence release
   %2 = llvm.atomicrmw add %ptr, %0 monotonic : !llvm.ptr, i32
   %3 = llvm.cmpxchg %ptr, %0, %1 acq_rel monotonic : !llvm.ptr, i32
@@ -44,7 +44,7 @@ func.func @inner_func_inlinable(%ptr : !llvm.ptr) -> i32 {
 // CHECK: "llvm.intr.memset"(%[[PTR]]
 // CHECK: "llvm.intr.memmove"(%[[PTR]], %[[PTR]]
 // CHECK: "llvm.intr.memcpy"(%[[PTR]], %[[PTR]]
-// CHECK: llvm.intr.assume
+// CHECK: "llvm.intr.assume"
 // CHECK: llvm.fence release
 // CHECK: llvm.atomicrmw add %[[PTR]], %[[CST]] monotonic
 // CHECK: llvm.cmpxchg %[[PTR]], %[[CST]], %[[RES]] acq_rel monotonic
diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
index b8ce7db795a1d1..3062cdc38c0abb 100644
--- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir
+++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
@@ -836,30 +836,3 @@ llvm.func @test_call_intrin_with_opbundle(%arg0 : !llvm.ptr) {
   llvm.call_intrinsic "llvm.assume"(%0) ["align"(%arg0, %1 : !llvm.ptr, i32)] : (i1) -> ()
   llvm.return
 }
-
-// CHECK-LABEL: @test_assume_intr_no_opbundle
-llvm.func @test_assume_intr_no_opbundle(%arg0 : !llvm.ptr) {
-  %0 = llvm.mlir.constant(1 : i1) : i1
-  // CHECK: llvm.intr.assume %0 : i1
-  llvm.intr.assume %0 : i1
-  llvm.return
-}
-
-// CHECK-LABEL: @test_assume_intr_empty_opbundle
-llvm.func @test_assume_intr_empty_opbundle(%arg0 : !llvm.ptr) {
-  %0 = llvm.mlir.constant(1 : i1) : i1
-  // CHECK: llvm.intr.assume %0 : i1
-  llvm.intr.assume %0 [] : i1
-  llvm.return
-}
-
-// CHECK-LABEL: @test_assume_intr_with_opbundles
-llvm.func @test_assume_intr_with_opbundles(%arg0 : !llvm.ptr) {
-  %0 = llvm.mlir.constant(1 : i1) : i1
-  %1 = llvm.mlir.constant(2 : i32) : i32
-  %2 = llvm.mlir.constant(3 : i32) : i32
-  %3 = llvm.mlir.constant(4 : i32) : i32
-  // CHECK: llvm.intr.assume %0 ["tag1"(%1, %2 : i32, i32), "tag2"(%3 : i32)] : i1
-  llvm.intr.assume %0 ["tag1"(%1, %2 : i32, i32), "tag2"(%3 : i32)] : i1
-  llvm.return
-}
diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
index 606b11175f572f..28a1bd21c82a38 100644
--- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll
+++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
@@ -630,21 +630,11 @@ define void @va_intrinsics_test(ptr %0, ptr %1, ...) {
 ; CHECK-LABEL: @assume
 ; CHECK-SAME:  %[[TRUE:[a-zA-Z0-9]+]]
 define void @assume(i1 %true) {
-  ; CHECK:  llvm.intr.assume %[[TRUE]] : i1
+  ; CHECK:  "llvm.intr.assume"(%[[TRUE]]) : (i1) -> ()
   call void @llvm.assume(i1 %true)
   ret void
 }
 
-; CHECK-LABEL: @assume_with_opbundles
-; CHECK-SAME:  %[[TRUE:[a-zA-Z0-9]+]]
-; CHECK-SAME:  %[[PTR:[a-zA-Z0-9]+]]
-define void @assume_with_opbundles(i1 %true, ptr %p) {
-  ; CHECK: %[[ALIGN:.+]] = llvm.mlir.constant(8 : i32) : i32
-  ; CHECK:  llvm.intr.assume %[[TRUE]] ["align"(%[[PTR]], %[[ALIGN]] : !llvm.ptr, i32)] : i1
-  call void @llvm.assume(i1 %true) ["align"(ptr %p, i32 8)]
-  ret void
-}
-
 ; CHECK-LABEL: @is_constant
 ; CHECK-SAME:  %[[VAL:[a-zA-Z0-9]+]]
 define void @is_constant(i32 %0) {
diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
index cb712eb4e1262d..0634a7ba907f1e 100644
--- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
@@ -363,21 +363,6 @@ llvm.func @umin_test(%arg0: i32, %arg1: i32, %arg2: vector<8xi32>, %arg3: vector
   llvm.return
 }
 
-// CHECK-LABEL: @assume_without_opbundles
-llvm.func @assume_without_opbundles(%cond: i1) {
-  // CHECK: call void @llvm.assume(i1 %{{.+}})
-  llvm.intr.assume %cond : i1
-  llvm.return
-}
-
-// CHECK-LABEL: @assume_with_opbundles
-llvm.func @assume_with_opbundles(%cond: i1, %p: !llvm.ptr) {
-  %0 = llvm.mlir.constant(8 : i32) : i32
-  // CHECK: call void @llvm.assume(i1 %{{.+}}) [ "align"(ptr %{{.+}}, i32 8) ]
-  llvm.intr.assume %cond ["align"(%p, %0 : !llvm.ptr, i32)] : i1
-  llvm.return
-}
-
 // CHECK-LABEL: @vector_reductions
 llvm.func @vector_reductions(%arg0: f32, %arg1: vector<8xf32>, %arg2: vector<8xi32>) {
   // CHECK: call i32 @llvm.vector.reduce.add.v8i32
diff --git a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
index 15658ea6068121..af0981440a1776 100644
--- a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
@@ -188,7 +188,7 @@ llvm.func @sadd_overflow_intr_wrong_type(%arg0 : i32, %arg1 : f32) -> !llvm.stru
 
 llvm.func @assume_intr_wrong_type(%cond : i16) {
   // expected-error @below{{op operand #0 must be 1-bit signless integer, but got 'i16'}}
-  llvm.intr.assume %cond : i16
+  "llvm.intr.assume"(%cond) : (i16) -> ()
   llvm.return
 }
 

>From bbff5b8891c0ce929d6ace2d86ea6891425042e2 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 16 Oct 2024 07:21:57 +0100
Subject: [PATCH 09/82] [VPlan] Use alloc-type to compute interleave group
 offset.

Use getTypeAllocSize to compute the offset to the start of interleave
groups instead of getScalarSizeInBits, which may return 0 for pointers.
This is in line with the analysis that builds the interleave groups and
fixes a miscompile reported for
https://github.com/llvm/llvm-project/pull/106431.
---
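As context for the message above, here is a minimal standalone sketch (not the VPlanTransforms code itself; the layout string is a hypothetical 64-bit one) of why the two queries differ for pointer-typed group members:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  DataLayout DL("e-p:64:64"); // hypothetical 64-bit layout string
  Type *PtrTy = PointerType::get(Ctx, /*AddressSpace=*/0);

  // 0: pointers have no primitive scalar size, so "bits / 8" yields a
  // zero byte offset for a pointer-typed interleave group member.
  outs() << PtrTy->getScalarSizeInBits() << "\n";

  // 8: the DataLayout knows how many bytes a stored pointer occupies, i.e.
  // the stride between consecutive group members in memory.
  outs() << DL.getTypeAllocSize(PtrTy).getFixedValue() << "\n";
  return 0;
}

This is also why, in the test updated below, the group's start address moves from an offset of 0 to -8 bytes relative to the insert position.
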
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp              | 3 ++-
 .../interleaved-accesses-different-insert-position.ll          | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 4443a7be4ad45b..faec08cac18751 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1646,8 +1646,9 @@ void VPlanTransforms::createInterleaveGroups(
       // zero.
       assert(IG->getIndex(IRInsertPos) != 0 &&
              "index of insert position shouldn't be zero");
+      auto &DL = IRInsertPos->getDataLayout();
       APInt Offset(32,
-                   getLoadStoreType(IRInsertPos)->getScalarSizeInBits() / 8 *
+                   DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
                        IG->getIndex(IRInsertPos),
                    /*IsSigned=*/true);
       VPValue *OffsetVPV = Plan.getOrAddLiveIn(
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
index 665fd1b9aeacaa..5913bae082f1da 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
@@ -183,7 +183,7 @@ define i64 @interleave_group_load_pointer_type(ptr %start, ptr %end) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 -8
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x ptr>, ptr [[TMP8]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x ptr> [[WIDE_VEC]], <12 x ptr> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x ptr> [[WIDE_VEC]], <12 x ptr> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>

>From e55869ae8a4ef1ae2e898ff5cd66fb8ae6e099b8 Mon Sep 17 00:00:00 2001
From: Piotr Fusik <p.fusik at samsung.com>
Date: Wed, 16 Oct 2024 08:30:22 +0200
Subject: [PATCH 10/82] [LV][NFC] Fix typos (#111971)

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 31 +++++++++----------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c8cf137816d3a0..8bf92f3480620a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -809,8 +809,7 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
 };
 } // end namespace llvm
 
-/// Look for a meaningful debug location on the instruction or it's
-/// operands.
+/// Look for a meaningful debug location on the instruction or its operands.
 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
   if (!I)
     return DebugLoc();
@@ -1798,7 +1797,7 @@ class GeneratedRTChecks {
 
   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
   /// accurately estimate the cost of the runtime checks. The blocks are
-  /// un-linked from the IR and is added back during vector code generation. If
+  /// un-linked from the IR and are added back during vector code generation. If
   /// there is no vector code generation, the check blocks are removed
   /// completely.
   void create(Loop *L, const LoopAccessInfo &LAI,
@@ -2581,7 +2580,7 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
     }
   }
 
-  // Create phi nodes to merge from the  backedge-taken check block.
+  // Create phi nodes to merge from the backedge-taken check block.
   PHINode *BCResumeVal =
       PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
                       LoopScalarPreHeader->getFirstNonPHIIt());
@@ -3002,7 +3001,8 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
 
       // We can't sink an instruction if it is a phi node, is not in the loop,
       // may have side effects or may read from memory.
-      // TODO Could dor more granular checking to allow sinking a load past non-store instructions.
+      // TODO: Could do more granular checking to allow sinking
+      // a load past non-store instructions.
       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
           I->mayHaveSideEffects() || I->mayReadFromMemory())
           continue;
@@ -3140,9 +3140,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
 
   // (2) Add to the worklist all bitcast and getelementptr instructions used by
   // memory accesses requiring a scalar use. The pointer operands of loads and
-  // stores will be scalar as long as the memory accesses is not a gather or
-  // scatter operation. The value operand of a store will remain scalar if the
-  // store is scalarized.
+  // stores will be scalar unless the operation is a gather or scatter.
+  // The value operand of a store will remain scalar if the store is scalarized.
   for (auto *BB : TheLoop->blocks())
     for (auto &I : *BB) {
       if (auto *Load = dyn_cast<LoadInst>(&I)) {
@@ -3415,7 +3414,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
   assert(Group && "Must have a group.");
   unsigned InterleaveFactor = Group->getFactor();
 
-  // If the instruction's allocated size doesn't equal it's type size, it
+  // If the instruction's allocated size doesn't equal its type size, it
   // requires padding and will be scalarized.
   auto &DL = I->getDataLayout();
   auto *ScalarTy = getLoadStoreType(I);
@@ -3515,11 +3514,11 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   assert(VF.isVector() && !Uniforms.contains(VF) &&
          "This function should not be visited twice for the same VF");
 
-  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
-  // not analyze again.  Uniforms.count(VF) will return 1.
+  // Visit the list of Uniforms. If we find no uniform value, we won't
+  // analyze again.  Uniforms.count(VF) will return 1.
   Uniforms[VF].clear();
 
-  // We now know that the loop is vectorizable!
+  // Now we know that the loop is vectorizable!
   // Collect instructions inside the loop that will remain uniform after
   // vectorization.
 
@@ -3566,7 +3565,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
 
   auto PrevVF = VF.divideCoefficientBy(2);
   // Return true if all lanes perform the same memory operation, and we can
-  // thus chose to execute only one.
+  // thus choose to execute only one.
   auto IsUniformMemOpUse = [&](Instruction *I) {
     // If the value was already known to not be uniform for the previous
     // (smaller VF), it cannot be uniform for the larger VF.
@@ -3957,7 +3956,7 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
 FixedScalableVFPair
 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
-    // TODO: It may by useful to do since it's still likely to be dynamically
+    // TODO: It may be useful to do since it's still likely to be dynamically
     // uniform if the target can skip.
     reportVectorizationFailure(
         "Not inserting runtime ptr check for divergent target",
@@ -4031,7 +4030,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
            "No decisions should have been taken at this point");
     // Note: There is no need to invalidate any cost modeling decisions here, as
-    // non where taken so far.
+    // none were taken so far.
     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
   }
 
@@ -7940,7 +7939,7 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
     BasicBlock *Bypass, BasicBlock *Insert) {
 
   assert(EPI.TripCount &&
-         "Expected trip count to have been safed in the first pass.");
+         "Expected trip count to have been saved in the first pass.");
   assert(
       (!isa<Instruction>(EPI.TripCount) ||
        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&

>From 24423107abc23a24d465189ba05e51d1bc31bbf2 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 16 Oct 2024 07:40:04 +0100
Subject: [PATCH 11/82] [LV] Add additional trip count expansion tests for
 #92177.

Extra tests for https://github.com/llvm/llvm-project/pull/92177, split
off from that PR.
---
 .../trip-count-expansion-may-introduce-ub.ll  | 310 ++++++++++++++++--
 1 file changed, 289 insertions(+), 21 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll b/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll
index c63dc9979bce3c..7dfd80a688f312 100644
--- a/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll
+++ b/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll
@@ -459,6 +459,7 @@ exit:
   ret i64 %p
 }
 
+; FIXME: currently the expansion of the loop bounds may introduce UB through the division.
 define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch(ptr %dst, i64 %N) {
 ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch(
 ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
@@ -526,6 +527,152 @@ exit:
   ret i64 %p
 }
 
+declare void @foo()
+
+; FIXME: currently the expansion of the loop bounds may introduce UB through the division.
+define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_call_before_loop(ptr %dst, i64 %N) {
+; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_call_before_loop(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 42, [[N]]
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze i64 [[TMP2]]
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0)
+; CHECK-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[SMAX]])
+; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP4]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i64 4, i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i32 1, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[C_0:%.*]] = icmp slt i64 [[IV]], [[N]]
+; CHECK-NEXT:    br i1 [[C_0]], label [[LOOP_LATCH]], label [[EXIT:%.*]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[D:%.*]] = udiv i64 42, [[N]]
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ]
+; CHECK-NEXT:    ret i64 [[P]]
+;
+entry:
+  call void @foo()
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %gep = getelementptr inbounds i32, ptr %dst, i64 %iv
+  store i32 1, ptr %gep
+  %c.0 = icmp slt i64 %iv, %N
+  br i1 %c.0, label %loop.latch, label %exit
+
+loop.latch:
+  %iv.next = add i64 %iv, 1
+  %d = udiv i64 42, %N
+  %c.1 = icmp slt i64 %iv, %d
+  br i1 %c.1, label %loop.header, label %exit
+
+exit:
+  %p = phi i64 [ 1, %loop.header ], [ 0, %loop.latch]
+  ret i64 %p
+}
+
+; FIXME: currently the expansion of the loop bounds may introduce UB through the division.
+define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_loop_may_not_execute(ptr %dst, i64 %N, i1 %c) {
+; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_loop_may_not_execute(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C]], label [[LOOP_HEADER_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       loop.header.preheader:
+; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 42, [[N]]
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze i64 [[TMP2]]
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0)
+; CHECK-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[SMAX]])
+; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP4]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i64 4, i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_HEADER_PREHEADER]] ]
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i32 1, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[C_0:%.*]] = icmp slt i64 [[IV]], [[N]]
+; CHECK-NEXT:    br i1 [[C_0]], label [[LOOP_LATCH]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[D:%.*]] = udiv i64 42, [[N]]
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    [[P_PH:%.*]] = phi i64 [ 0, [[LOOP_LATCH]] ], [ 1, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[P:%.*]] = phi i64 [ 2, [[ENTRY:%.*]] ], [ [[P_PH]], [[EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i64 [[P]]
+;
+entry:
+  br i1 %c, label %loop.header, label %exit
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %gep = getelementptr inbounds i32, ptr %dst, i64 %iv
+  store i32 1, ptr %gep
+  %c.0 = icmp slt i64 %iv, %N
+  br i1 %c.0, label %loop.latch, label %exit
+
+loop.latch:
+  %iv.next = add i64 %iv, 1
+  %d = udiv i64 42, %N
+  %c.1 = icmp slt i64 %iv, %d
+  br i1 %c.1, label %loop.header, label %exit
+
+exit:
+  %p = phi i64 [ 1, %loop.header ], [ 0, %loop.latch], [ 2, %entry ]
+  ret i64 %p
+}
+
+; FIXME: currently the expansion of the loop bounds may introduce UB through the division.
 define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_different_bounds(ptr %dst, i64 %N, i64 %M) {
 ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_different_bounds(
 ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[M:%.*]]) {
@@ -551,7 +698,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_different_bounds
 ; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -567,7 +714,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_different_bounds
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[D:%.*]] = udiv i64 42, [[M]]
 ; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]]
-; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ]
 ; CHECK-NEXT:    ret i64 [[P]]
@@ -593,16 +740,16 @@ exit:
   ret i64 %p
 }
 
-
+; FIXME: currently the expansion of the loop bounds may introduce UB through the division.
 define i64 @multi_exit_4_exit_count_with_udiv_by_frozen_value_in_latch(ptr %dst, i64 %N) {
 ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_frozen_value_in_latch(
 ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[FR_N:%.*]] = freeze i64 [[N]]
-; CHECK-NEXT:    [[TMP0:%.*]] = udiv i64 42, [[FR_N]]
-; CHECK-NEXT:    [[TMP1:%.*]] = freeze i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 42, [[FR_N]]
+; CHECK-NEXT:    [[TMP10:%.*]] = freeze i64 [[TMP2]]
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0)
-; CHECK-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SMAX]])
+; CHECK-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP10]], i64 [[SMAX]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[UMIN]], 1
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP3]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -620,7 +767,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_frozen_value_in_latch(ptr %dst,
 ; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -636,7 +783,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_frozen_value_in_latch(ptr %dst,
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[D:%.*]] = udiv i64 42, [[FR_N]]
 ; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]]
-; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ]
 ; CHECK-NEXT:    ret i64 [[P]]
@@ -688,7 +835,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_constant_in_latch(ptr %dst, i64
 ; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -704,7 +851,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_constant_in_latch(ptr %dst, i64
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV1]], 1
 ; CHECK-NEXT:    [[D:%.*]] = udiv i64 [[N]], 42
 ; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i64 [[IV1]], [[D]]
-; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_HEADER1]], label [[EXIT]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_HEADER1]], label [[EXIT]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER1]] ], [ 0, [[LOOP_LATCH]] ]
 ; CHECK-NEXT:    ret i64 [[P]]
@@ -750,7 +897,7 @@ define void @single_exit_tc_with_udiv(ptr %dst, i64 %N) {
 ; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP4]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -764,7 +911,7 @@ define void @single_exit_tc_with_udiv(ptr %dst, i64 %N) {
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[D:%.*]] = udiv i64 42, [[N]]
 ; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]]
-; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -784,6 +931,7 @@ exit:
   ret void
 }
 
+; FIXME: currently the expansion of the loop bounds may introduce UB through the division.
 define i64 @multi_exit_4_exit_count_with_urem_by_value_in_latch(ptr %dst, i64 %N) {
 ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_urem_by_value_in_latch(
 ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
@@ -812,7 +960,7 @@ define i64 @multi_exit_4_exit_count_with_urem_by_value_in_latch(ptr %dst, i64 %N
 ; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -828,7 +976,7 @@ define i64 @multi_exit_4_exit_count_with_urem_by_value_in_latch(ptr %dst, i64 %N
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[D:%.*]] = urem i64 42, [[N]]
 ; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]]
-; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ]
 ; CHECK-NEXT:    ret i64 [[P]]
@@ -854,6 +1002,7 @@ exit:
   ret i64 %p
 }
 
+; FIXME: currently the expansion of the loop bounds may introduce UB through the division.
 define i64 @multi_exit_4_exit_count_with_urem_by_constant_in_latch(ptr %dst, i64 %N) {
 ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_urem_by_constant_in_latch(
 ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
@@ -879,7 +1028,7 @@ define i64 @multi_exit_4_exit_count_with_urem_by_constant_in_latch(ptr %dst, i64
 ; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -895,7 +1044,7 @@ define i64 @multi_exit_4_exit_count_with_urem_by_constant_in_latch(ptr %dst, i64
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV1]], 1
 ; CHECK-NEXT:    [[D:%.*]] = urem i64 [[N]], 42
 ; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i64 [[IV1]], [[D]]
-; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_HEADER1]], label [[EXIT]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_HEADER1]], label [[EXIT]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER1]] ], [ 0, [[LOOP_LATCH]] ]
 ; CHECK-NEXT:    ret i64 [[P]]
@@ -1007,10 +1156,10 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch1(ptr %dst, i64 %
 ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch1(
 ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = udiv i64 42, [[N]]
-; CHECK-NEXT:    [[TMP8:%.*]] = freeze i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = udiv i64 42, [[N]]
+; CHECK-NEXT:    [[TMP10:%.*]] = freeze i64 [[TMP9]]
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0)
-; CHECK-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP8]], i64 [[SMAX]])
+; CHECK-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP10]], i64 [[SMAX]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[UMIN]], 1
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP1]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -1028,7 +1177,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch1(ptr %dst, i64 %
 ; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -1045,7 +1194,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch1(ptr %dst, i64 %
 ; CHECK-NEXT:    [[D:%.*]] = udiv i64 42, [[N]]
 ; CHECK-NEXT:    [[X:%.*]] = sub i64 100, [[D]]
 ; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]]
-; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP29:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ]
 ; CHECK-NEXT:    ret i64 [[P]]
@@ -1072,6 +1221,119 @@ exit:
   ret i64 %p
 }
 
+define i64 @multi_exit_exit_count_with_udiv_by_0_in_latch(ptr %dst, i64 %N) {
+; CHECK-LABEL: define i64 @multi_exit_exit_count_with_udiv_by_0_in_latch(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i32 1, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[C_0:%.*]] = icmp slt i64 [[IV]], [[N]]
+; CHECK-NEXT:    br i1 [[C_0]], label [[LOOP_LATCH]], label [[EXIT:%.*]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[D:%.*]] = udiv i64 42, 0
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ]
+; CHECK-NEXT:    ret i64 [[P]]
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %gep = getelementptr inbounds i32, ptr %dst, i64 %iv
+  store i32 1, ptr %gep
+  %c.0 = icmp slt i64 %iv, %N
+  br i1 %c.0, label %loop.latch, label %exit
+
+loop.latch:
+  %iv.next = add i64 %iv, 1
+  %d = udiv i64 42, 0
+  %c.1 = icmp slt i64 %iv, %d
+  br i1 %c.1, label %loop.header, label %exit
+
+exit:
+  %p = phi i64 [ 1, %loop.header ], [ 0, %loop.latch]
+  ret i64 %p
+}
+
+; FIXME: currently the expansion of the loop bounds may introduce UB through the division.
+define i64 @multi_exit_count_with_udiv_by_value_in_latch_different_bounds_divisor_non_zero_may_be_poison(ptr %dst, i64 %N, i64 %M) {
+; CHECK-LABEL: define i64 @multi_exit_count_with_udiv_by_value_in_latch_different_bounds_divisor_non_zero_may_be_poison(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[M:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[M_1:%.*]] = call i64 @llvm.umax.i64(i64 [[M]], i64 1)
+; CHECK-NEXT:    [[TMP0:%.*]] = udiv i64 42, [[M_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = freeze i64 [[TMP0]]
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0)
+; CHECK-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SMAX]])
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i32 1, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[C_0:%.*]] = icmp slt i64 [[IV]], [[N]]
+; CHECK-NEXT:    br i1 [[C_0]], label [[LOOP_LATCH]], label [[EXIT:%.*]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[D:%.*]] = udiv i64 42, [[M_1]]
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP31:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ]
+; CHECK-NEXT:    ret i64 [[P]]
+;
+entry:
+  %M.1 = call i64 @llvm.umax.i64(i64 %M, i64 1)
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %gep = getelementptr inbounds i32, ptr %dst, i64 %iv
+  store i32 1, ptr %gep
+  %c.0 = icmp slt i64 %iv, %N
+  br i1 %c.0, label %loop.latch, label %exit
+
+loop.latch:
+  %iv.next = add i64 %iv, 1
+  %d = udiv i64 42, %M.1
+  %c.1 = icmp slt i64 %iv, %d
+  br i1 %c.1, label %loop.header, label %exit
+
+exit:
+  %p = phi i64 [ 1, %loop.header ], [ 0, %loop.latch]
+  ret i64 %p
+}
+
+declare i64 @llvm.umax.i64(i64, i64)
+
 
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
@@ -1100,4 +1362,10 @@ exit:
 ; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
+; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
+; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
+; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]}
+; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]}
+; CHECK: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]}
+; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]}
 ;.

>From 37ad65ffb6b8b8867e5d58f05ba676211d0da233 Mon Sep 17 00:00:00 2001
From: Andrzej Warzyński <andrzej.warzynski at arm.com>
Date: Wed, 16 Oct 2024 07:43:49 +0100
Subject: [PATCH 12/82] [mlir][arith] Remove some e2e tests (#112012)

I am removing the recently added integration tests for various Arith Ops.
These operations and their lowerings are effectively already verified by
the Arith-to-LLVM conversion tests in:
  * "mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir"

I've noticed that a few variants of `arith.cmpi` were missing in that
file - those are added here as well.

This is a follow-up for this discussion:
  * https://github.com/llvm/llvm-project/pull/92272

See also the recent update to our guidelines on e2e tests in MLIR:
  * https://github.com/llvm/mlir-www/pull/203
---
 .../Conversion/ArithToLLVM/arith-to-llvm.mlir |  40 ++--
 .../Dialect/Arith/CPU/addition.mlir           |  88 ---------
 .../Dialect/Arith/CPU/comparison.mlir         | 174 ------------------
 .../Dialect/Arith/CPU/multiplication.mlir     | 119 ------------
 4 files changed, 26 insertions(+), 395 deletions(-)
 delete mode 100644 mlir/test/Integration/Dialect/Arith/CPU/addition.mlir
 delete mode 100644 mlir/test/Integration/Dialect/Arith/CPU/comparison.mlir
 delete mode 100644 mlir/test/Integration/Dialect/Arith/CPU/multiplication.mlir

diff --git a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir
index d3bdbe89a54876..64c40f1aba43bc 100644
--- a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir
+++ b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir
@@ -46,33 +46,45 @@ func.func @ops(f32, f32, i32, i32, f64) -> (f32, i32) {
   %1 = arith.subi %arg2, %arg3: i32
 // CHECK: = llvm.icmp "slt" %arg2, %1 : i32
   %2 = arith.cmpi slt, %arg2, %1 : i32
+// CHECK: = llvm.icmp "sle" %arg2, %1 : i32
+  %3 = arith.cmpi sle, %arg2, %1 : i32
+// CHECK: = llvm.icmp "sgt" %arg2, %1 : i32
+  %4 = arith.cmpi sgt, %arg2, %1 : i32
+// CHECK: = llvm.icmp "ult" %arg2, %1 : i32
+  %5 = arith.cmpi ult, %arg2, %1 : i32
+// CHECK: = llvm.icmp "ule" %arg2, %1 : i32
+  %6 = arith.cmpi ule, %arg2, %1 : i32
+// CHECK: = llvm.icmp "ugt" %arg2, %1 : i32
+  %7 = arith.cmpi ugt, %arg2, %1 : i32
+// CHECK: = llvm.icmp "eq" %arg2, %1 : i32
+  %8 = arith.cmpi eq, %arg2, %1 : i32
 // CHECK: = llvm.sdiv %arg2, %arg3 : i32
-  %3 = arith.divsi %arg2, %arg3 : i32
+  %9 = arith.divsi %arg2, %arg3 : i32
 // CHECK: = llvm.udiv %arg2, %arg3 : i32
-  %4 = arith.divui %arg2, %arg3 : i32
+  %10 = arith.divui %arg2, %arg3 : i32
 // CHECK: = llvm.srem %arg2, %arg3 : i32
-  %5 = arith.remsi %arg2, %arg3 : i32
+  %11 = arith.remsi %arg2, %arg3 : i32
 // CHECK: = llvm.urem %arg2, %arg3 : i32
-  %6 = arith.remui %arg2, %arg3 : i32
+  %12 = arith.remui %arg2, %arg3 : i32
 // CHECK: = llvm.fdiv %arg0, %arg1 : f32
-  %8 = arith.divf %arg0, %arg1 : f32
+  %13 = arith.divf %arg0, %arg1 : f32
 // CHECK: = llvm.frem %arg0, %arg1 : f32
-  %9 = arith.remf %arg0, %arg1 : f32
+  %14 = arith.remf %arg0, %arg1 : f32
 // CHECK: = llvm.and %arg2, %arg3 : i32
-  %10 = arith.andi %arg2, %arg3 : i32
+  %15 = arith.andi %arg2, %arg3 : i32
 // CHECK: = llvm.or %arg2, %arg3 : i32
-  %11 = arith.ori %arg2, %arg3 : i32
+  %16 = arith.ori %arg2, %arg3 : i32
 // CHECK: = llvm.xor %arg2, %arg3 : i32
-  %12 = arith.xori %arg2, %arg3 : i32
+  %17 = arith.xori %arg2, %arg3 : i32
 // CHECK: = llvm.mlir.constant(7.900000e-01 : f64) : f64
-  %15 = arith.constant 7.9e-01 : f64
+  %18 = arith.constant 7.9e-01 : f64
 // CHECK: = llvm.shl %arg2, %arg3 : i32
-  %16 = arith.shli %arg2, %arg3 : i32
+  %19 = arith.shli %arg2, %arg3 : i32
 // CHECK: = llvm.ashr %arg2, %arg3 : i32
-  %17 = arith.shrsi %arg2, %arg3 : i32
+  %20 = arith.shrsi %arg2, %arg3 : i32
 // CHECK: = llvm.lshr %arg2, %arg3 : i32
-  %18 = arith.shrui %arg2, %arg3 : i32
-  return %0, %4 : f32, i32
+  %21 = arith.shrui %arg2, %arg3 : i32
+  return %0, %10 : f32, i32
 }
 
 // Checking conversion of index types to integers using i1, assuming no target
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/addition.mlir b/mlir/test/Integration/Dialect/Arith/CPU/addition.mlir
deleted file mode 100644
index b6acfd53c1f5d9..00000000000000
--- a/mlir/test/Integration/Dialect/Arith/CPU/addition.mlir
+++ /dev/null
@@ -1,88 +0,0 @@
-// RUN: mlir-opt %s --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \
-// RUN:             --convert-func-to-llvm --convert-arith-to-llvm | \
-// RUN:   mlir-cpu-runner -e entry -entry-point-result=void \
-// RUN:                   --shared-libs=%mlir_c_runner_utils | \
-// RUN:   FileCheck %s --match-full-lines
-
-func.func @addi_i1(%v1 : i1, %v2 : i1) {
-  vector.print str "@addi_i1\n"
-  %res = arith.addi %v1, %v2 : i1
-  vector.print %res : i1
-  return
-}
-
-func.func @addi() {
-  // ------------------------------------------------
-  // Test i1
-  // ------------------------------------------------
-
-  // addi on i1
-  // addi(0, 1) : i1 = 1 : i1; addi(0, -1) : i1 = 1
-  %false = arith.constant 0 : i1
-  %true = arith.constant 1 : i1
-
-  // CHECK-LABEL: @addi_i1
-  // CHECK-NEXT:  1
-  func.call @addi_i1(%false, %true) : (i1, i1) -> ()
-
-  // CHECK-LABEL: @addi_i1
-  // CHECK-NEXT:  1
-  %true_based_on_non_zero_val = arith.constant -1 : i1
-  func.call @addi_i1(%false, %true_based_on_non_zero_val) : (i1, i1) -> ()
-
-  // ------------------------------------------------
-  // TODO: Test i8, i16 etc..
-  // ------------------------------------------------
-
-  return
-}
-
-func.func @addui_extended_i1(%v1 : i1, %v2 : i1) {
-  vector.print str "@addui_extended_i1\n"
-  %res, %overflow = arith.addui_extended %v1, %v2 : i1, i1
-  vector.print %res : i1
-  vector.print %overflow : i1
-  return
-}
-
-func.func @addi_extended() {
-  // ------------------------------------------------
-  // Test i1
-  // ------------------------------------------------
-
-  // addui_extended on i1
-  // addui_extended 1 1 : i1 = 0, 1
-  %true = arith.constant 1 : i1
-  %false = arith.constant 0 : i1
-  
-  // CHECK-LABEL: @addui_extended_i1
-  // CHECK-NEXT:  0
-  // CHECK-NEXT:  1
-  func.call @addui_extended_i1(%true, %true) : (i1, i1) -> ()
-
-  // CHECK-LABEL: @addui_extended_i1
-  // CHECK-NEXT:  1
-  // CHECK-NEXT:  0
-  func.call @addui_extended_i1(%true, %false) : (i1, i1) -> ()
-
-  // CHECK-LABEL: @addui_extended_i1
-  // CHECK-NEXT:  1
-  // CHECK-NEXT:  0
-  func.call @addui_extended_i1(%false, %true) : (i1, i1) -> ()
-
-  // CHECK-LABEL: @addui_extended_i1
-  // CHECK-NEXT:  0
-  // CHECK-NEXT:  0
-  func.call @addui_extended_i1(%false, %false) : (i1, i1) -> ()
-
-  // ------------------------------------------------
-  // TODO: Test i8, i16 etc.. 
-  // ------------------------------------------------
-  return
-}
-
-func.func @entry() {
-  func.call @addi() : () -> ()
-  func.call @addi_extended() : () -> ()
-  return
-}
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/comparison.mlir b/mlir/test/Integration/Dialect/Arith/CPU/comparison.mlir
deleted file mode 100644
index 418fbb0c0a94c7..00000000000000
--- a/mlir/test/Integration/Dialect/Arith/CPU/comparison.mlir
+++ /dev/null
@@ -1,174 +0,0 @@
-// RUN: mlir-opt %s --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \
-// RUN:             --convert-func-to-llvm --convert-arith-to-llvm | \
-// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
-// RUN:                   --shared-libs=%mlir_c_runner_utils | \
-// RUN: FileCheck %s --match-full-lines
-
-func.func @cmpi_eq_i1(%v1 : i1, %v2 : i1) {
-  vector.print str "@cmpi_eq_i1\n"
-  %res = arith.cmpi eq, %v1, %v2 : i1
-  vector.print %res : i1
-  return
-}
-
-func.func @cmpi_slt_i1(%v1 : i1, %v2 : i1) {
-  vector.print str "@cmpi_slt_i1\n"
-  %res = arith.cmpi slt, %v1, %v2 : i1
-  vector.print %res : i1
-  return
-}
-
-func.func @cmpi_sle_i1(%v1 : i1, %v2 : i1) {
-  vector.print str "@cmpi_sle_i1\n"
-  %res = arith.cmpi sle, %v1, %v2 : i1
-  vector.print %res : i1
-  return
-}
-
-func.func @cmpi_sgt_i1(%v1 : i1, %v2 : i1) {
-  vector.print str "@cmpi_sgt_i1\n"
-  %res = arith.cmpi sgt, %v1, %v2 : i1
-  vector.print %res : i1
-  return
-}
-
-func.func @cmpi_sge_i1(%v1 : i1, %v2 : i1) {
-  vector.print str "@cmpi_sge_i1\n"
-  %res = arith.cmpi sge, %v1, %v2 : i1
-  vector.print %res : i1
-  return
-}
-
-func.func @cmpi_eq() {
-  // ------------------------------------------------
-  // Test i1
-  // ------------------------------------------------
-  %false_i1 = arith.constant 0 : i1
-  %true_i1 = arith.constant 1 : i1
-  %true_i1_n1 = arith.constant -1 : i1
-
-  // int values 1 and -1 are represented with the same bitvector (`0b1`)
-  // CHECK-LABEL: @cmpi_eq_i1
-  // CHECK-NEXT:  1
-  func.call @cmpi_eq_i1(%true_i1, %true_i1_n1) : (i1, i1) -> ()
-
-  // CHECK-LABEL: @cmpi_eq_i1
-  // CHECK-NEXT:  0
-  func.call @cmpi_eq_i1(%false_i1, %true_i1) : (i1, i1) -> ()
-
-  // CHECK-LABEL: @cmpi_eq_i1
-  // CHECK-NEXT:  0
-  func.call @cmpi_eq_i1(%true_i1, %false_i1) : (i1, i1) -> ()
-
-  // CHECK-LABEL: @cmpi_eq_i1
-  // CHECK-NEXT:  1
-  func.call @cmpi_eq_i1(%true_i1, %true_i1) : (i1, i1) -> ()
-
-  // CHECK-LABEL: @cmpi_eq_i1
-  // CHECK-NEXT:  1
-  func.call @cmpi_eq_i1(%false_i1, %false_i1) : (i1, i1) -> ()
-
-  %false = arith.constant false
-  %true = arith.constant true
-
-  // CHECK-LABEL: @cmpi_eq_i1
-  // CHECK-NEXT:  1
-  func.call @cmpi_eq_i1(%true, %true_i1) : (i1, i1) -> ()
-
-  // CHECK-LABEL: @cmpi_eq_i1
-  // CHECK-NEXT:  1
-  func.call @cmpi_eq_i1(%false, %false_i1) : (i1, i1) -> ()
-
-  // CHECK-LABEL: @cmpi_eq_i1
-  // CHECK-NEXT:  1
-  func.call @cmpi_eq_i1(%true, %true_i1_n1) : (i1, i1) -> ()
-
-  // ------------------------------------------------
-  // TODO: Test i8, i16 etc..
-  // ------------------------------------------------
-  return
-}
-
-func.func @cmpi_signed() {
-  // ------------------------------------------------
-  // Test i1
-  // ------------------------------------------------
-  %false_i1 = arith.constant 0 : i1
-  %true_i1 = arith.constant 1 : i1
-  %true_i1_n1 = arith.constant -1 : i1
-
-  // int values 1 and -1 are represented with the same bitvector (`0b1`)
-  // But, bitvector `1` is interpreted as int value -1 in signed comparison
-
-  // CHECK-LABEL: @cmpi_sge_i1
-  // CHECK-NEXT:  1
-  func.call @cmpi_sge_i1(%false_i1, %true_i1_n1) : (i1, i1) -> ()
-
-  // CHECK-LABEL: @cmpi_sge_i1
-  // CHECK-NEXT:  1
-  func.call @cmpi_sge_i1(%false_i1, %true_i1) : (i1, i1) -> ()
-  
-  // CHECK-LABEL: @cmpi_sge_i1
-  // CHECK-NEXT:  0
-  func.call @cmpi_sge_i1(%true_i1, %false_i1) : (i1, i1) -> ()
-
-  %false = arith.constant false
-  %true = arith.constant true
-
-  // CHECK-LABEL: @cmpi_slt_i1
-  // CHECK-NEXT:  0
-  func.call @cmpi_slt_i1(%false, %true) : (i1, i1) -> ()
-
-  // CHECK-LABEL: @cmpi_sle_i1
-  // CHECK-NEXT:  0
-  func.call @cmpi_sle_i1(%false, %true) : (i1, i1) -> ()
-
-  // CHECK-LABEL: @cmpi_sgt_i1
-  // CHECK-NEXT:  1
-  func.call @cmpi_sgt_i1(%false, %true) : (i1, i1) -> ()
-
-  // CHECK-LABEL: @cmpi_sge_i1
-  // CHECK-NEXT:  1
-  func.call @cmpi_sge_i1(%false, %true) : (i1, i1) -> ()
-  
-  // CHECK-LABEL: @cmpi_sge_i1
-  // CHECK-NEXT:  0
-  func.call @cmpi_sge_i1(%true, %false) : (i1, i1) -> ()
-  
-  // ------------------------------------------------
-  // TODO: Test i8, i16 etc..
-  // ------------------------------------------------
-  return
-}
-
-func.func @cmpi_ult_index(%v1 : index, %v2 : index) {
-  vector.print str "@cmpi_ult_index\n"
-  %res = arith.cmpi ult, %v1, %v2 : index
-  vector.print %res : i1
-  return
-}
-
-func.func @cmpi_unsigned() {
-  // ------------------------------------------------
-  // Test index
-  // ------------------------------------------------
-  // 0 `ult` -2^63 = true
-  %zero = arith.constant 0 : index
-  %index_min = arith.constant -9223372036854775808 : index
-
-  // CHECK-LABEL: @cmpi_ult_index
-  // CHECK-NEXT: 1
-  func.call @cmpi_ult_index(%zero, %index_min) : (index, index) -> ()
-  
-  // ------------------------------------------------
-  // TODO: i1, i8, i16, uge, ule etc.. 
-  // ------------------------------------------------
-  return
-}
-
-func.func @entry() {
-  func.call @cmpi_eq() : () -> ()
-  func.call @cmpi_signed() : () -> ()
-  func.call @cmpi_unsigned() : () -> ()
-  return
-}
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/multiplication.mlir b/mlir/test/Integration/Dialect/Arith/CPU/multiplication.mlir
deleted file mode 100644
index 21fd816788431e..00000000000000
--- a/mlir/test/Integration/Dialect/Arith/CPU/multiplication.mlir
+++ /dev/null
@@ -1,119 +0,0 @@
-// RUN: mlir-opt %s --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \
-// RUN:             --convert-func-to-llvm --convert-arith-to-llvm | \
-// RUN:   mlir-cpu-runner -e entry -entry-point-result=void \
-// RUN:                   --shared-libs=%mlir_c_runner_utils | \
-// RUN:   FileCheck %s --match-full-lines
-
-func.func @mulsi_extended_i1(%v1 : i1, %v2 : i1) {
-  vector.print str "@mulsi_extended_i1\n"
-  %low, %high = arith.mulsi_extended %v1, %v2 : i1
-  vector.print %low : i1
-  vector.print %high : i1
-  return
-}
-
-func.func @mulsi_extended_i8(%v1 : i8, %v2 : i8) {
-  vector.print str "@mulsi_extended_i8\n"
-  %low, %high = arith.mulsi_extended %v1, %v2 : i8
-  vector.print %low : i8
-  vector.print %high : i8
-  return
-}
-
-func.func @mulsi_extended() {
-  // ------------------------------------------------
-  // Test i1
-  // ------------------------------------------------
-
-  // mulsi_extended on i1, tests for overflow bit
-  // mulsi_extended 1, 1 : i1 = (1, 0)
-  %true = arith.constant true
-  %false = arith.constant false
-
-  // CHECK-LABEL: @mulsi_extended_i1
-  // CHECK-NEXT:  1
-  // CHECK-NEXT:  0
-  func.call @mulsi_extended_i1(%true, %true) : (i1, i1) -> ()
-
-  // CHECK-LABEL: @mulsi_extended_i1
-  // CHECK-NEXT:  0
-  // CHECK-NEXT:  0
-  func.call @mulsi_extended_i1(%true, %false) : (i1, i1) -> ()
-
-  // CHECK-LABEL: @mulsi_extended_i1
-  // CHECK-NEXT:  0
-  // CHECK-NEXT:  0
-  func.call @mulsi_extended_i1(%false, %true) : (i1, i1) -> ()
-
-  // CHECK-LABEL: @mulsi_extended_i1
-  // CHECK-NEXT:  0
-  // CHECK-NEXT:  0
-  func.call @mulsi_extended_i1(%false, %false) : (i1, i1) -> ()
-
-  // ------------------------------------------------
-  // Test i8
-  // ------------------------------------------------
-  // mulsi extended versions, with overflow
-  %c_100_i8 = arith.constant -100 : i8
-
-  // mulsi_extended -100, -100 : i8 = (16, 39)
-  // CHECK-LABEL: @mulsi_extended_i8
-  // CHECK-NEXT:  16
-  // CHECK-NEXT:  39
-  func.call @mulsi_extended_i8(%c_100_i8, %c_100_i8) : (i8, i8) -> ()
-
-  // ------------------------------------------------
-  // TODO: Test i16, i32 etc.. 
-  // ------------------------------------------------
-  return
-}
-
-func.func @mului_extended_i8(%v1 : i8, %v2 : i8) {
-  vector.print str "@mului_extended_i8\n"
-  %low, %high = arith.mului_extended %v1, %v2 : i8
-  vector.print %low : i8
-  vector.print %high : i8
-  return
-}
-
-func.func @mului_extended() {
-  // ------------------------------------------------
-  // Test i8
-  // ------------------------------------------------
-  %c_n100_i8 = arith.constant -100 : i8
-  %c_156_i8 = arith.constant 156 : i8
-
-  // mului_extended -100, -100 : i8 = (16, 95)
-  // and on equivalent representations (e.g. 156 === -100 (mod 256))
-
-  // CHECK-LABEL: @mului_extended_i8
-  // CHECK-NEXT:  16
-  // CHECK-NEXT:  95
-  func.call @mului_extended_i8(%c_n100_i8, %c_n100_i8) : (i8, i8) -> ()
-
-  // CHECK-LABEL: @mului_extended_i8
-  // CHECK-NEXT:  16
-  // CHECK-NEXT:  95
-  func.call @mului_extended_i8(%c_n100_i8, %c_156_i8) : (i8, i8) -> ()
-
-  // CHECK-LABEL: @mului_extended_i8
-  // CHECK-NEXT:  16
-  // CHECK-NEXT:  95
-  func.call @mului_extended_i8(%c_156_i8, %c_n100_i8) : (i8, i8) -> ()
-
-  // CHECK-LABEL: @mului_extended_i8
-  // CHECK-NEXT:  16
-  // CHECK-NEXT:  95
-  func.call @mului_extended_i8(%c_156_i8, %c_156_i8) : (i8, i8) -> ()
-
-  // ------------------------------------------------
-  // TODO: Test i1, i16, i32 etc.. 
-  // ------------------------------------------------
-  return
-}
-
-func.func @entry() {
-  func.call @mulsi_extended() : () -> ()
-  func.call @mului_extended() : () -> ()
-  return
-}

>From e1d205a3855898b413978ee457f37e361ae981bd Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Wed, 16 Oct 2024 00:09:12 -0700
Subject: [PATCH 13/82] [SCCP] Simplify code with DenseMap::operator[] (NFC)
 (#112473)

---
 llvm/lib/Transforms/Utils/SCCPSolver.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index 101d60525f4161..c65710ea7551ac 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -630,10 +630,7 @@ class SCCPInstVisitor : public InstVisitor<SCCPInstVisitor> {
   }
 
   // Add U as additional user of V.
-  void addAdditionalUser(Value *V, User *U) {
-    auto Iter = AdditionalUsers.insert({V, {}});
-    Iter.first->second.insert(U);
-  }
+  void addAdditionalUser(Value *V, User *U) { AdditionalUsers[V].insert(U); }
 
   // Mark I's users as changed, including AdditionalUsers.
   void markUsersAsChanged(Value *I) {

>From 14d006c53c67ded7da00e7880c58f2c7e25ee1f1 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Wed, 16 Oct 2024 09:43:16 +0200
Subject: [PATCH 14/82] AMDGPU/GlobalISel: Run redundant_and combine in
 RegBankCombiner (#112353)

The combine is needed to clear redundant ANDs with 1 that reg-bank-select
will create to clean up the high bits in a register.
Also fix replaceRegWith from CombinerHelper: if a copy has to be
inserted, first create the copy, then delete MI. If MI is deleted first,
the insertion point is no longer valid.
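
A condensed sketch of the corrected ordering (hypothetical; it relies on
the CombinerHelper members shown in the diff below and is not a complete
buildable unit):

void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  // If the register attributes cannot be constrained, replaceRegWith
  // falls back to building a COPY, which needs MI to still exist as the
  // insertion point.
  replaceRegWith(MRI, DstReg, SrcReg);
  // Only after all uses are rewritten is it safe to drop the instruction.
  MI.eraseFromParent();
}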
---
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  12 +-
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td       |   3 +-
 .../GlobalISel/artifact-combiner-asserts.ll   |   4 +-
 .../regbankcombiner-redundant-and.mir         |  28 +++
 llvm/test/CodeGen/AMDGPU/fptoi.i128.ll        |  12 +-
 llvm/test/CodeGen/AMDGPU/itofp.i128.ll        | 166 +++++++++---------
 6 files changed, 125 insertions(+), 100 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-redundant-and.mir

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 14e94d48bf8362..f9b1621955c217 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -178,7 +178,7 @@ void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
   if (MRI.constrainRegAttrs(ToReg, FromReg))
     MRI.replaceRegWith(FromReg, ToReg);
   else
-    Builder.buildCopy(ToReg, FromReg);
+    Builder.buildCopy(FromReg, ToReg);
 
   Observer.finishedChangingAllUsesOfReg();
 }
@@ -229,8 +229,8 @@ bool CombinerHelper::matchCombineCopy(MachineInstr &MI) {
 void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
   Register DstReg = MI.getOperand(0).getReg();
   Register SrcReg = MI.getOperand(1).getReg();
-  MI.eraseFromParent();
   replaceRegWith(MRI, DstReg, SrcReg);
+  MI.eraseFromParent();
 }
 
 bool CombinerHelper::matchFreezeOfSingleMaybePoisonOperand(
@@ -379,8 +379,8 @@ void CombinerHelper::applyCombineConcatVectors(MachineInstr &MI,
     Builder.buildUndef(NewDstReg);
   else
     Builder.buildBuildVector(NewDstReg, Ops);
-  MI.eraseFromParent();
   replaceRegWith(MRI, DstReg, NewDstReg);
+  MI.eraseFromParent();
 }
 
 bool CombinerHelper::matchCombineShuffleConcat(MachineInstr &MI,
@@ -559,8 +559,8 @@ void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI,
   else
     Builder.buildMergeLikeInstr(NewDstReg, Ops);
 
-  MI.eraseFromParent();
   replaceRegWith(MRI, DstReg, NewDstReg);
+  MI.eraseFromParent();
 }
 
 bool CombinerHelper::matchShuffleToExtract(MachineInstr &MI) {
@@ -2825,8 +2825,8 @@ void CombinerHelper::replaceSingleDefInstWithOperand(MachineInstr &MI,
   Register OldReg = MI.getOperand(0).getReg();
   Register Replacement = MI.getOperand(OpIdx).getReg();
   assert(canReplaceReg(OldReg, Replacement, MRI) && "Cannot replace register?");
-  MI.eraseFromParent();
   replaceRegWith(MRI, OldReg, Replacement);
+  MI.eraseFromParent();
 }
 
 void CombinerHelper::replaceSingleDefInstWithReg(MachineInstr &MI,
@@ -2834,8 +2834,8 @@ void CombinerHelper::replaceSingleDefInstWithReg(MachineInstr &MI,
   assert(MI.getNumExplicitDefs() == 1 && "Expected one explicit def?");
   Register OldReg = MI.getOperand(0).getReg();
   assert(canReplaceReg(OldReg, Replacement, MRI) && "Cannot replace register?");
-  MI.eraseFromParent();
   replaceRegWith(MRI, OldReg, Replacement);
+  MI.eraseFromParent();
 }
 
 bool CombinerHelper::matchConstantLargerBitWidth(MachineInstr &MI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b2a3f9392157d1..985fa8f1deff94 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -169,5 +169,6 @@ def AMDGPURegBankCombiner : GICombiner<
   "AMDGPURegBankCombinerImpl",
   [unmerge_merge, unmerge_cst, unmerge_undef,
    zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
-   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
+   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
+   redundant_and]> {
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll
index 6dce6c1852af9b..6e4fb2678b382d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll
@@ -27,10 +27,8 @@ define hidden <2 x i64> @icmp_v2i32_zext_to_v2i64(<2 x i32> %arg) {
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; CHECK-NEXT:    v_and_b32_e32 v2, 1, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %arg, zeroinitializer
   %sext = zext <2 x i1> %cmp to <2 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-redundant-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-redundant-and.mir
new file mode 100644
index 00000000000000..f87a253dcb4333
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-redundant-and.mir
@@ -0,0 +1,28 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: replaceRegWith_requires_copy
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0, $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: replaceRegWith_requires_copy
+    ; CHECK: liveins: $sgpr0, $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sreg_32(s32) = G_ICMP intpred(ne), [[COPY1]](s32), [[C]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY [[ICMP]](s32)
+    ; CHECK-NEXT: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store (s32), addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sgpr(p1) = COPY $vgpr0_vgpr1
+    %1:sgpr(s32) = COPY $sgpr0
+    %2:sgpr(s32) = G_CONSTANT i32 1
+    %3:sreg_32(s32) = G_ICMP intpred(ne), %1, %2
+    %4:sgpr(s32) = G_AND %3, %2
+    G_STORE %4(s32), %0(p1) :: (store (s32), addrspace 1)
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 6e8e6c07217895..786fe03164690e 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -136,12 +136,12 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v5, v1
 ; GISEL-NEXT:    v_mov_b32_e32 v4, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 20, v5
-; GISEL-NEXT:    v_and_b32_e32 v6, 0x7ff, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 20, v5
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0x3ff
 ; GISEL-NEXT:    s_mov_b64 s[4:5], 0
-; GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v7, 0
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_and_b32_e32 v6, 0x7ff, v2
 ; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
 ; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -508,12 +508,12 @@ define i128 @fptoui_f64_to_i128(double %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v5, v1
 ; GISEL-NEXT:    v_mov_b32_e32 v4, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 20, v5
-; GISEL-NEXT:    v_and_b32_e32 v6, 0x7ff, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 20, v5
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0x3ff
 ; GISEL-NEXT:    s_mov_b64 s[4:5], 0
-; GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v7, 0
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_and_b32_e32 v6, 0x7ff, v2
 ; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
 ; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s4
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
index 38d928a006fb20..2999ddb8315883 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
@@ -673,38 +673,38 @@ define double @sitofp_i128_to_f64(i128 %x) {
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v6, v4
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v6, v5
-; GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_xor_b32_e32 v2, v6, v2
-; GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v3, v6, v3
-; GISEL-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v6, vcc
-; GISEL-NEXT:    v_ffbh_u32_e32 v5, v0
-; GISEL-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
-; GISEL-NEXT:    v_ffbh_u32_e32 v4, v1
-; GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
-; GISEL-NEXT:    v_ffbh_u32_e32 v7, v2
-; GISEL-NEXT:    v_min_u32_e32 v4, v4, v5
-; GISEL-NEXT:    v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT:    v_xor_b32_e32 v4, v6, v2
+; GISEL-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v6
+; GISEL-NEXT:    v_xor_b32_e32 v5, v6, v3
+; GISEL-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v6, vcc
+; GISEL-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v6, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v1, v2
+; GISEL-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v6, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v0, v3
+; GISEL-NEXT:    v_add_u32_e32 v1, 32, v1
+; GISEL-NEXT:    v_ffbh_u32_e32 v7, v4
+; GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
+; GISEL-NEXT:    v_ffbh_u32_e32 v1, v5
 ; GISEL-NEXT:    v_add_u32_e32 v7, 32, v7
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GISEL-NEXT:    v_add_u32_e32 v4, 64, v4
-; GISEL-NEXT:    v_min_u32_e32 v5, v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v5, v4, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT:    v_add_u32_e32 v0, 64, v0
+; GISEL-NEXT:    v_min_u32_e32 v1, v1, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v1, v0, vcc
 ; GISEL-NEXT:    v_sub_u32_e32 v8, 0x80, v9
 ; GISEL-NEXT:    v_sub_u32_e32 v7, 0x7f, v9
 ; GISEL-NEXT:    v_cmp_ge_i32_e32 vcc, 53, v8
 ; GISEL-NEXT:    ; implicit-def: $vgpr10
-; GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GISEL-NEXT:  ; %bb.2: ; %itofp-if-else
-; GISEL-NEXT:    v_add_u32_e32 v2, 0xffffffb5, v9
-; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
-; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT:    v_add_u32_e32 v4, 0xffffffb5, v9
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v4, v[2:3]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v1, vcc
 ; GISEL-NEXT:    ; implicit-def: $vgpr8
-; GISEL-NEXT:    ; implicit-def: $vgpr0
+; GISEL-NEXT:    ; implicit-def: $vgpr2
 ; GISEL-NEXT:    ; implicit-def: $vgpr9
 ; GISEL-NEXT:  ; %bb.3: ; %Flow3
 ; GISEL-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
@@ -721,89 +721,88 @@ define double @sitofp_i128_to_f64(i128 %x) {
 ; GISEL-NEXT:  ; %bb.6: ; %itofp-sw-default
 ; GISEL-NEXT:    v_sub_u32_e32 v14, 0x49, v9
 ; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v14
-; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v14, v[0:1]
-; GISEL-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], v14, v[2:3]
+; GISEL-NEXT:    v_lshlrev_b64 v[10:11], v10, v[4:5]
 ; GISEL-NEXT:    v_subrev_u32_e32 v15, 64, v14
-; GISEL-NEXT:    v_or_b32_e32 v10, v4, v10
-; GISEL-NEXT:    v_or_b32_e32 v11, v5, v11
-; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v15, v[2:3]
-; GISEL-NEXT:    v_lshrrev_b64 v[12:13], v14, v[2:3]
+; GISEL-NEXT:    v_lshrrev_b64 v[12:13], v14, v[4:5]
+; GISEL-NEXT:    v_or_b32_e32 v10, v0, v10
+; GISEL-NEXT:    v_or_b32_e32 v11, v1, v11
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], v15, v[4:5]
 ; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; GISEL-NEXT:    v_add_u32_e32 v9, 55, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
-; GISEL-NEXT:    v_add_u32_e32 v14, 55, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
-; GISEL-NEXT:    v_sub_u32_e32 v11, 64, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, v4, v0, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v5, v1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v12, vcc
-; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v14, -1
-; GISEL-NEXT:    v_lshlrev_b64 v[11:12], v11, -1
-; GISEL-NEXT:    v_subrev_u32_e32 v15, 64, v14
-; GISEL-NEXT:    v_or_b32_e32 v16, v9, v11
-; GISEL-NEXT:    v_or_b32_e32 v17, v10, v12
-; GISEL-NEXT:    v_lshrrev_b64 v[11:12], v15, -1
-; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
-; GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v16, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v17, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, v11, -1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, v12, -1, s[4:5]
-; GISEL-NEXT:    v_and_b32_e32 v2, v9, v2
-; GISEL-NEXT:    v_and_b32_e32 v3, v10, v3
-; GISEL-NEXT:    v_and_or_b32 v0, v11, v0, v2
-; GISEL-NEXT:    v_and_or_b32 v1, v12, v1, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v12, vcc
+; GISEL-NEXT:    v_sub_u32_e32 v12, 64, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, v0, v2, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, v1, v3, s[4:5]
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], v9, -1
+; GISEL-NEXT:    v_lshlrev_b64 v[12:13], v12, -1
+; GISEL-NEXT:    v_subrev_u32_e32 v15, 64, v9
+; GISEL-NEXT:    v_or_b32_e32 v16, v0, v12
+; GISEL-NEXT:    v_or_b32_e32 v17, v1, v13
+; GISEL-NEXT:    v_lshrrev_b64 v[12:13], v15, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v13, v17, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, v12, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, v13, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
+; GISEL-NEXT:    v_and_or_b32 v0, v9, v2, v0
+; GISEL-NEXT:    v_and_or_b32 v1, v12, v3, v1
 ; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_or_b32_e32 v3, v13, v0
-; GISEL-NEXT:    v_mov_b32_e32 v0, v3
-; GISEL-NEXT:    v_mov_b32_e32 v1, v4
-; GISEL-NEXT:    v_mov_b32_e32 v2, v5
-; GISEL-NEXT:    v_mov_b32_e32 v3, v6
+; GISEL-NEXT:    v_or_b32_e32 v9, v14, v0
+; GISEL-NEXT:    v_mov_b32_e32 v2, v9
+; GISEL-NEXT:    v_mov_b32_e32 v3, v10
+; GISEL-NEXT:    v_mov_b32_e32 v4, v11
+; GISEL-NEXT:    v_mov_b32_e32 v5, v12
 ; GISEL-NEXT:  .LBB2_7: ; %Flow1
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
 ; GISEL-NEXT:  .LBB2_8: ; %Flow2
 ; GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
 ; GISEL-NEXT:    s_cbranch_execz .LBB2_10
 ; GISEL-NEXT:  ; %bb.9: ; %itofp-sw-bb
-; GISEL-NEXT:    v_lshlrev_b64 v[9:10], 1, v[0:1]
-; GISEL-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
-; GISEL-NEXT:    v_or_b32_e32 v11, v2, v0
-; GISEL-NEXT:    v_mov_b32_e32 v0, v9
-; GISEL-NEXT:    v_mov_b32_e32 v1, v10
-; GISEL-NEXT:    v_mov_b32_e32 v2, v11
-; GISEL-NEXT:    v_mov_b32_e32 v3, v12
+; GISEL-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], 1, v[2:3]
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 31, v3
+; GISEL-NEXT:    v_or_b32_e32 v2, v4, v2
+; GISEL-NEXT:    v_mov_b32_e32 v5, v3
+; GISEL-NEXT:    v_mov_b32_e32 v4, v2
+; GISEL-NEXT:    v_mov_b32_e32 v3, v1
+; GISEL-NEXT:    v_mov_b32_e32 v2, v0
 ; GISEL-NEXT:  .LBB2_10: ; %itofp-sw-epilog
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT:    v_bfe_u32 v3, v0, 2, 1
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
-; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
-; GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT:    v_bfe_u32 v0, v2, 2, 1
+; GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GISEL-NEXT:    v_add_co_u32_e32 v2, vcc, 1, v0
+; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], 2, v[2:3]
 ; GISEL-NEXT:    v_mov_b32_e32 v9, 0
-; GISEL-NEXT:    v_and_b32_e32 v10, 0x800000, v1
+; GISEL-NEXT:    v_and_b32_e32 v10, 0x800000, v3
 ; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[9:10]
-; GISEL-NEXT:    v_lshl_or_b32 v10, v2, 30, v5
+; GISEL-NEXT:    v_lshl_or_b32 v10, v4, 30, v1
 ; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GISEL-NEXT:  ; %bb.11: ; %itofp-if-then20
-; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], 3, v[2:3]
 ; GISEL-NEXT:    v_mov_b32_e32 v7, v8
-; GISEL-NEXT:    v_lshl_or_b32 v10, v2, 29, v5
+; GISEL-NEXT:    v_lshl_or_b32 v10, v4, 29, v1
 ; GISEL-NEXT:  ; %bb.12: ; %Flow
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GISEL-NEXT:  .LBB2_13: ; %Flow4
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GISEL-NEXT:    v_and_b32_e32 v0, 0x80000000, v6
-; GISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0xfffff
-; GISEL-NEXT:    v_lshl_add_u32 v1, v7, 20, v1
-; GISEL-NEXT:    v_and_or_b32 v2, v10, v2, v0
-; GISEL-NEXT:    v_and_or_b32 v0, v4, -1, 0
-; GISEL-NEXT:    v_or3_b32 v1, v2, v1, 0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0x80000000, v6
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x3ff00000
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0xfffff
+; GISEL-NEXT:    v_lshl_add_u32 v2, v7, 20, v2
+; GISEL-NEXT:    v_and_or_b32 v1, v10, v3, v1
+; GISEL-NEXT:    v_or3_b32 v1, v1, v2, 0
 ; GISEL-NEXT:  .LBB2_14: ; %Flow5
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1083,7 +1082,6 @@ define double @uitofp_i128_to_f64(i128 %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0x3ff00000
 ; GISEL-NEXT:    v_lshl_add_u32 v0, v6, 20, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff, v9
-; GISEL-NEXT:    v_and_or_b32 v4, v4, -1, 0
 ; GISEL-NEXT:    v_or3_b32 v5, v1, v0, 0
 ; GISEL-NEXT:  .LBB3_14: ; %Flow5
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]

>From eccf4d44d346eee498b0ff709e625e3104448751 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <christudasan.devadasan at amd.com>
Date: Wed, 16 Oct 2024 13:16:55 +0530
Subject: [PATCH 15/82] [CodeGen] Remove unused MachineBranchProbabilityInfo
 from MachineTraceMetrics pass(NFC). (#108506)

---
 llvm/lib/CodeGen/MachineTraceMetrics.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
index bf3add010574b8..5a1670953ddd97 100644
--- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -14,7 +14,6 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/SparseSet.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
@@ -44,9 +43,8 @@ char MachineTraceMetrics::ID = 0;
 
 char &llvm::MachineTraceMetricsID = MachineTraceMetrics::ID;
 
-INITIALIZE_PASS_BEGIN(MachineTraceMetrics, DEBUG_TYPE,
-                      "Machine Trace Metrics", false, true)
-INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_BEGIN(MachineTraceMetrics, DEBUG_TYPE, "Machine Trace Metrics",
+                      false, true)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
 INITIALIZE_PASS_END(MachineTraceMetrics, DEBUG_TYPE,
                     "Machine Trace Metrics", false, true)
@@ -57,7 +55,6 @@ MachineTraceMetrics::MachineTraceMetrics() : MachineFunctionPass(ID) {
 
 void MachineTraceMetrics::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
-  AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
   AU.addRequired<MachineLoopInfoWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }

>From 732b804e5f0fd3d5e267c7f39fedc6525ebda3ba Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <christudasan.devadasan at amd.com>
Date: Wed, 16 Oct 2024 13:19:55 +0530
Subject: [PATCH 16/82] [CodeGen][NewPM] Port machine trace metrics analysis to
 new pass manager. (#108507)

---
 .../llvm/CodeGen/MachineTraceMetrics.h        | 65 +++++++++++++---
 llvm/include/llvm/InitializePasses.h          |  2 +-
 .../llvm/Passes/MachinePassRegistry.def       |  4 +-
 llvm/lib/CodeGen/EarlyIfConversion.cpp        |  8 +-
 llvm/lib/CodeGen/MachineCombiner.cpp          |  8 +-
 llvm/lib/CodeGen/MachineTraceMetrics.cpp      | 78 +++++++++++++------
 llvm/lib/Passes/PassBuilder.cpp               |  1 +
 .../AArch64/AArch64ConditionalCompares.cpp    |  8 +-
 .../AArch64/AArch64StorePairSuppress.cpp      |  6 +-
 9 files changed, 130 insertions(+), 50 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineTraceMetrics.h b/llvm/include/llvm/CodeGen/MachineTraceMetrics.h
index c7d97597d551cd..d51de24d64e8d7 100644
--- a/llvm/include/llvm/CodeGen/MachineTraceMetrics.h
+++ b/llvm/include/llvm/CodeGen/MachineTraceMetrics.h
@@ -46,12 +46,13 @@
 #ifndef LLVM_CODEGEN_MACHINETRACEMETRICS_H
 #define LLVM_CODEGEN_MACHINETRACEMETRICS_H
 
-#include "llvm/ADT/SparseSet.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseSet.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePassManager.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 
 namespace llvm {
@@ -93,7 +94,7 @@ enum class MachineTraceStrategy {
   TS_NumStrategies
 };
 
-class MachineTraceMetrics : public MachineFunctionPass {
+class MachineTraceMetrics {
   const MachineFunction *MF = nullptr;
   const TargetInstrInfo *TII = nullptr;
   const TargetRegisterInfo *TRI = nullptr;
@@ -102,19 +103,25 @@ class MachineTraceMetrics : public MachineFunctionPass {
   TargetSchedModel SchedModel;
 
 public:
+  friend class MachineTraceMetricsWrapperPass;
   friend class Ensemble;
   friend class Trace;
 
   class Ensemble;
 
-  static char ID;
+  // For legacy pass.
+  MachineTraceMetrics() = default;
+
+  explicit MachineTraceMetrics(MachineFunction &MF, const MachineLoopInfo &LI) {
+    init(MF, LI);
+  }
 
-  MachineTraceMetrics();
+  MachineTraceMetrics(MachineTraceMetrics &&) = default;
 
-  void getAnalysisUsage(AnalysisUsage&) const override;
-  bool runOnMachineFunction(MachineFunction&) override;
-  void releaseMemory() override;
-  void verifyAnalysis() const override;
+  ~MachineTraceMetrics();
+
+  void init(MachineFunction &Func, const MachineLoopInfo &LI);
+  void clear();
 
   /// Per-basic block information that doesn't depend on the trace through the
   /// block.
@@ -400,6 +407,12 @@ class MachineTraceMetrics : public MachineFunctionPass {
   /// Call Ensemble::getTrace() again to update any trace handles.
   void invalidate(const MachineBasicBlock *MBB);
 
+  /// Handle invalidation explicitly.
+  bool invalidate(MachineFunction &, const PreservedAnalyses &PA,
+                  MachineFunctionAnalysisManager::Invalidator &);
+
+  void verifyAnalysis() const;
+
 private:
   // One entry per basic block, indexed by block number.
   SmallVector<FixedBlockInfo, 4> BlockInfo;
@@ -412,8 +425,8 @@ class MachineTraceMetrics : public MachineFunctionPass {
   SmallVector<unsigned, 0> ProcReleaseAtCycles;
 
   // One ensemble per strategy.
-  Ensemble
-      *Ensembles[static_cast<size_t>(MachineTraceStrategy::TS_NumStrategies)];
+  std::unique_ptr<Ensemble>
+      Ensembles[static_cast<size_t>(MachineTraceStrategy::TS_NumStrategies)];
 
   // Convert scaled resource usage to a cycle count that can be compared with
   // latencies.
@@ -435,6 +448,38 @@ inline raw_ostream &operator<<(raw_ostream &OS,
   return OS;
 }
 
+class MachineTraceMetricsAnalysis
+    : public AnalysisInfoMixin<MachineTraceMetricsAnalysis> {
+  friend AnalysisInfoMixin<MachineTraceMetricsAnalysis>;
+  static AnalysisKey Key;
+
+public:
+  using Result = MachineTraceMetrics;
+  Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM);
+};
+
+/// Verifier pass for \c MachineTraceMetrics.
+struct MachineTraceMetricsVerifierPass
+    : PassInfoMixin<MachineTraceMetricsVerifierPass> {
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+  static bool isRequired() { return true; }
+};
+
+class MachineTraceMetricsWrapperPass : public MachineFunctionPass {
+public:
+  static char ID;
+  MachineTraceMetrics MTM;
+
+  MachineTraceMetricsWrapperPass();
+
+  void getAnalysisUsage(AnalysisUsage &) const override;
+  bool runOnMachineFunction(MachineFunction &) override;
+  void releaseMemory() override { MTM.clear(); }
+  void verifyAnalysis() const override { MTM.verifyAnalysis(); }
+  MachineTraceMetrics &getMTM() { return MTM; }
+};
+
 } // end namespace llvm
 
 #endif // LLVM_CODEGEN_MACHINETRACEMETRICS_H
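
The header changes above follow the usual shape of a new-pass-manager analysis with a legacy shim: the computation lives in a plain result class, the analysis type derives from AnalysisInfoMixin and owns a static AnalysisKey, and the old MachineFunctionPass shrinks to a wrapper that holds one result object. A condensed sketch with placeholder Foo* names (the AnalysisKey, the pass ID, and the INITIALIZE_PASS boilerplate still need definitions in a .cpp file):

  #include "llvm/CodeGen/MachineFunctionPass.h"
  #include "llvm/CodeGen/MachinePassManager.h"

  namespace llvm {

  class FooMetrics {                      // plain result, no pass base class
  public:
    void init(MachineFunction &MF) { /* compute per-function data */ }
    void clear() { /* drop cached data */ }
  };

  class FooMetricsAnalysis : public AnalysisInfoMixin<FooMetricsAnalysis> {
    friend AnalysisInfoMixin<FooMetricsAnalysis>;
    static AnalysisKey Key;               // define in a .cpp file

  public:
    using Result = FooMetrics;
    Result run(MachineFunction &MF, MachineFunctionAnalysisManager &) {
      Result R;
      R.init(MF);
      return R;
    }
  };

  class FooMetricsWrapperPass : public MachineFunctionPass {
  public:
    static char ID;                       // define and register in a .cpp file
    FooMetrics MTM;
    FooMetricsWrapperPass() : MachineFunctionPass(ID) {}
    bool runOnMachineFunction(MachineFunction &MF) override {
      MTM.init(MF);
      return false;                       // analysis-style pass: nothing changed
    }
    void releaseMemory() override { MTM.clear(); }
  };

  } // namespace llvm
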
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 6a75dc0285cc61..5ed0ad98a2a72d 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -209,7 +209,7 @@ void initializeMachineRegionInfoPassPass(PassRegistry &);
 void initializeMachineSanitizerBinaryMetadataPass(PassRegistry &);
 void initializeMachineSchedulerPass(PassRegistry &);
 void initializeMachineSinkingPass(PassRegistry &);
-void initializeMachineTraceMetricsPass(PassRegistry &);
+void initializeMachineTraceMetricsWrapperPassPass(PassRegistry &);
 void initializeMachineUniformityInfoPrinterPassPass(PassRegistry &);
 void initializeMachineUniformityAnalysisPassPass(PassRegistry &);
 void initializeMachineVerifierLegacyPassPass(PassRegistry &);
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 2aa5f4fc176aba..1d7084354455c7 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -106,6 +106,7 @@ MACHINE_FUNCTION_ANALYSIS("machine-opt-remark-emitter",
                           MachineOptimizationRemarkEmitterAnalysis())
 MACHINE_FUNCTION_ANALYSIS("machine-post-dom-tree",
                           MachinePostDominatorTreeAnalysis())
+MACHINE_FUNCTION_ANALYSIS("machine-trace-metrics", MachineTraceMetricsAnalysis())
 MACHINE_FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC))
 MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis())
 // MACHINE_FUNCTION_ANALYSIS("live-stacks", LiveStacksPass())
@@ -119,8 +120,6 @@ MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis())
 // MachinePostDominatorTreeAnalysis())
 // MACHINE_FUNCTION_ANALYSIS("machine-region-info",
 // MachineRegionInfoPassAnalysis())
-// MACHINE_FUNCTION_ANALYSIS("machine-trace-metrics",
-// MachineTraceMetricsAnalysis()) MACHINE_FUNCTION_ANALYSIS("reaching-def",
 // ReachingDefAnalysisAnalysis()) MACHINE_FUNCTION_ANALYSIS("live-reg-matrix",
 // LiveRegMatrixAnalysis()) MACHINE_FUNCTION_ANALYSIS("gc-analysis",
 // GCMachineCodeAnalysisPass())
@@ -156,6 +155,7 @@ MACHINE_FUNCTION_PASS("stack-coloring", StackColoringPass())
 MACHINE_FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass())
 MACHINE_FUNCTION_PASS("two-address-instruction", TwoAddressInstructionPass())
 MACHINE_FUNCTION_PASS("verify", MachineVerifierPass())
+MACHINE_FUNCTION_PASS("verify<machine-trace-metrics>", MachineTraceMetricsVerifierPass())
 #undef MACHINE_FUNCTION_PASS
 
 #ifndef MACHINE_FUNCTION_PASS_WITH_PARAMS
diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp
index 8d9813edd7e52a..53cf6a5169795f 100644
--- a/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -792,7 +792,7 @@ INITIALIZE_PASS_BEGIN(EarlyIfConverter, DEBUG_TYPE,
                       "Early If Converter", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass)
 INITIALIZE_PASS_END(EarlyIfConverter, DEBUG_TYPE,
                     "Early If Converter", false, false)
 
@@ -802,8 +802,8 @@ void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<MachineDominatorTreeWrapperPass>();
   AU.addRequired<MachineLoopInfoWrapperPass>();
   AU.addPreserved<MachineLoopInfoWrapperPass>();
-  AU.addRequired<MachineTraceMetrics>();
-  AU.addPreserved<MachineTraceMetrics>();
+  AU.addRequired<MachineTraceMetricsWrapperPass>();
+  AU.addPreserved<MachineTraceMetricsWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
@@ -1093,7 +1093,7 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
   MRI = &MF.getRegInfo();
   DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   Loops = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
-  Traces = &getAnalysis<MachineTraceMetrics>();
+  Traces = &getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
   MinInstr = nullptr;
 
   bool Changed = false;
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
index 1a19e053d30fe1..5bfc1d63ac3764 100644
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -133,7 +133,7 @@ char &llvm::MachineCombinerID = MachineCombiner::ID;
 INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
                       "Machine InstCombiner", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass)
 INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
                     false, false)
 
@@ -142,8 +142,8 @@ void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<MachineDominatorTreeWrapperPass>();
   AU.addRequired<MachineLoopInfoWrapperPass>();
   AU.addPreserved<MachineLoopInfoWrapperPass>();
-  AU.addRequired<MachineTraceMetrics>();
-  AU.addPreserved<MachineTraceMetrics>();
+  AU.addRequired<MachineTraceMetricsWrapperPass>();
+  AU.addPreserved<MachineTraceMetricsWrapperPass>();
   AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
   AU.addRequired<ProfileSummaryInfoWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
@@ -727,7 +727,7 @@ bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
   TSchedModel.init(STI);
   MRI = &MF.getRegInfo();
   MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
-  Traces = &getAnalysis<MachineTraceMetrics>();
+  Traces = &getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
   PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   MBFI = (PSI && PSI->hasProfileSummary()) ?
          &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
index 5a1670953ddd97..92df6b9ab48d7c 100644
--- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -39,47 +39,66 @@ using namespace llvm;
 
 #define DEBUG_TYPE "machine-trace-metrics"
 
-char MachineTraceMetrics::ID = 0;
+AnalysisKey MachineTraceMetricsAnalysis::Key;
 
-char &llvm::MachineTraceMetricsID = MachineTraceMetrics::ID;
+MachineTraceMetricsAnalysis::Result
+MachineTraceMetricsAnalysis::run(MachineFunction &MF,
+                                 MachineFunctionAnalysisManager &MFAM) {
+  return Result(MF, MFAM.getResult<MachineLoopAnalysis>(MF));
+}
+
+PreservedAnalyses
+MachineTraceMetricsVerifierPass::run(MachineFunction &MF,
+                                     MachineFunctionAnalysisManager &MFAM) {
+  MFAM.getResult<MachineTraceMetricsAnalysis>(MF).verifyAnalysis();
+  return PreservedAnalyses::all();
+}
 
-INITIALIZE_PASS_BEGIN(MachineTraceMetrics, DEBUG_TYPE, "Machine Trace Metrics",
-                      false, true)
+char MachineTraceMetricsWrapperPass::ID = 0;
+
+char &llvm::MachineTraceMetricsID = MachineTraceMetricsWrapperPass::ID;
+
+INITIALIZE_PASS_BEGIN(MachineTraceMetricsWrapperPass, DEBUG_TYPE,
+                      "Machine Trace Metrics", false, true)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
-INITIALIZE_PASS_END(MachineTraceMetrics, DEBUG_TYPE,
+INITIALIZE_PASS_END(MachineTraceMetricsWrapperPass, DEBUG_TYPE,
                     "Machine Trace Metrics", false, true)
 
-MachineTraceMetrics::MachineTraceMetrics() : MachineFunctionPass(ID) {
-  std::fill(std::begin(Ensembles), std::end(Ensembles), nullptr);
-}
+MachineTraceMetricsWrapperPass::MachineTraceMetricsWrapperPass()
+    : MachineFunctionPass(ID) {}
 
-void MachineTraceMetrics::getAnalysisUsage(AnalysisUsage &AU) const {
+void MachineTraceMetricsWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
   AU.addRequired<MachineLoopInfoWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
-bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) {
+void MachineTraceMetrics::init(MachineFunction &Func,
+                               const MachineLoopInfo &LI) {
   MF = &Func;
   const TargetSubtargetInfo &ST = MF->getSubtarget();
   TII = ST.getInstrInfo();
   TRI = ST.getRegisterInfo();
   MRI = &MF->getRegInfo();
-  Loops = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+  Loops = &LI;
   SchedModel.init(&ST);
   BlockInfo.resize(MF->getNumBlockIDs());
   ProcReleaseAtCycles.resize(MF->getNumBlockIDs() *
                             SchedModel.getNumProcResourceKinds());
+}
+
+bool MachineTraceMetricsWrapperPass::runOnMachineFunction(MachineFunction &MF) {
+  MTM.init(MF, getAnalysis<MachineLoopInfoWrapperPass>().getLI());
   return false;
 }
 
-void MachineTraceMetrics::releaseMemory() {
+MachineTraceMetrics::~MachineTraceMetrics() { clear(); }
+
+void MachineTraceMetrics::clear() {
   MF = nullptr;
   BlockInfo.clear();
-  for (Ensemble *&E : Ensembles) {
-    delete E;
-    E = nullptr;
-  }
+  for (auto &E : Ensembles)
+    E.reset();
 }
 
 //===----------------------------------------------------------------------===//
@@ -395,35 +414,50 @@ MachineTraceMetrics::Ensemble *
 MachineTraceMetrics::getEnsemble(MachineTraceStrategy strategy) {
   assert(strategy < MachineTraceStrategy::TS_NumStrategies &&
          "Invalid trace strategy enum");
-  Ensemble *&E = Ensembles[static_cast<size_t>(strategy)];
+  std::unique_ptr<MachineTraceMetrics::Ensemble> &E =
+      Ensembles[static_cast<size_t>(strategy)];
   if (E)
-    return E;
+    return E.get();
 
   // Allocate new Ensemble on demand.
   switch (strategy) {
   case MachineTraceStrategy::TS_MinInstrCount:
-    return (E = new MinInstrCountEnsemble(this));
+    E = std::make_unique<MinInstrCountEnsemble>(MinInstrCountEnsemble(this));
+    break;
   case MachineTraceStrategy::TS_Local:
-    return (E = new LocalEnsemble(this));
+    E = std::make_unique<LocalEnsemble>(LocalEnsemble(this));
+    break;
   default: llvm_unreachable("Invalid trace strategy enum");
   }
+  return E.get();
 }
 
 void MachineTraceMetrics::invalidate(const MachineBasicBlock *MBB) {
   LLVM_DEBUG(dbgs() << "Invalidate traces through " << printMBBReference(*MBB)
                     << '\n');
   BlockInfo[MBB->getNumber()].invalidate();
-  for (Ensemble *E : Ensembles)
+  for (auto &E : Ensembles)
     if (E)
       E->invalidate(MBB);
 }
 
+bool MachineTraceMetrics::invalidate(
+    MachineFunction &, const PreservedAnalyses &PA,
+    MachineFunctionAnalysisManager::Invalidator &) {
+  // Check whether the analysis, all analyses on machine functions, or the
+  // machine function's CFG have been preserved.
+  auto PAC = PA.getChecker<MachineTraceMetricsAnalysis>();
+  return !PAC.preserved() &&
+         !PAC.preservedSet<AllAnalysesOn<MachineFunction>>() &&
+         !PAC.preservedSet<CFGAnalyses>();
+}
+
 void MachineTraceMetrics::verifyAnalysis() const {
   if (!MF)
     return;
 #ifndef NDEBUG
   assert(BlockInfo.size() == MF->getNumBlockIDs() && "Outdated BlockInfo size");
-  for (Ensemble *E : Ensembles)
+  for (auto &E : Ensembles)
     if (E)
       E->verify();
 #endif
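
Two details of this file are worth noting. The Ensembles array now stores std::unique_ptr, so clear() and the new destructor release memory without hand-written deletes. More importantly, the result defines an invalidate() hook: after a transform runs, the cached MachineTraceMetrics survives only if the transform reported this analysis, all machine-function analyses, or the CFG analyses as preserved. A transform that wants to keep the cached result therefore says so explicitly, roughly as sketched here (illustrative pass; the same pattern appears in the EarlyIfConverterPass port later in this series):

  #include "llvm/CodeGen/MachinePassManager.h"
  #include "llvm/CodeGen/MachineTraceMetrics.h"

  using namespace llvm;

  // Illustrative transform that changes code but keeps the trace metrics
  // result valid by reporting it preserved, so invalidate() returns false.
  struct KeepTraceMetricsPass : PassInfoMixin<KeepTraceMetricsPass> {
    PreservedAnalyses run(MachineFunction &MF,
                          MachineFunctionAnalysisManager &) {
      bool Changed = false; /* ... perform some rewrite on MF ... */
      if (!Changed)
        return PreservedAnalyses::all();
      auto PA = getMachineFunctionPassPreservedAnalyses();
      PA.preserve<MachineTraceMetricsAnalysis>();
      return PA;
    }
  };
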
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 36c0cea3613105..08cc4ddbe34327 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -109,6 +109,7 @@
 #include "llvm/CodeGen/MachinePassManager.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
 #include "llvm/CodeGen/MachineVerifier.h"
 #include "llvm/CodeGen/PHIElimination.h"
 #include "llvm/CodeGen/PreISelIntrinsicLowering.h"
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 9669a393bc2b94..0301032e849772 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -795,7 +795,7 @@ INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp",
                       "AArch64 CCMP Pass", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass)
 INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp",
                     "AArch64 CCMP Pass", false, false)
 
@@ -809,8 +809,8 @@ void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<MachineDominatorTreeWrapperPass>();
   AU.addRequired<MachineLoopInfoWrapperPass>();
   AU.addPreserved<MachineLoopInfoWrapperPass>();
-  AU.addRequired<MachineTraceMetrics>();
-  AU.addPreserved<MachineTraceMetrics>();
+  AU.addRequired<MachineTraceMetricsWrapperPass>();
+  AU.addPreserved<MachineTraceMetricsWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
@@ -937,7 +937,7 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
   DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   Loops = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
   MBPI = &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
-  Traces = &getAnalysis<MachineTraceMetrics>();
+  Traces = &getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
   MinInstr = nullptr;
   MinSize = MF.getFunction().hasMinSize();
 
diff --git a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index 047e38261a6fcd..d8c8b17565abb9 100644
--- a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -53,8 +53,8 @@ class AArch64StorePairSuppress : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addRequired<MachineTraceMetrics>();
-    AU.addPreserved<MachineTraceMetrics>();
+    AU.addRequired<MachineTraceMetricsWrapperPass>();
+    AU.addPreserved<MachineTraceMetricsWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -139,7 +139,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
   TRI = ST.getRegisterInfo();
   MRI = &MF.getRegInfo();
   SchedModel.init(&ST);
-  Traces = &getAnalysis<MachineTraceMetrics>();
+  Traces = &getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
   MinInstr = nullptr;
 
   LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << MF.getName() << '\n');
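
On the consumer side, the rest of the patch mechanically retargets legacy passes from MachineTraceMetrics to the wrapper, as the EarlyIfConversion, MachineCombiner, AArch64ConditionalCompares and AArch64StorePairSuppress hunks above show. The recurring shape, written out for a hypothetical legacy pass:

  #include "llvm/CodeGen/MachineFunctionPass.h"
  #include "llvm/CodeGen/MachineTraceMetrics.h"

  using namespace llvm;

  namespace {
  // Hypothetical legacy consumer; only the analysis plumbing is shown.
  class MyLegacyPass : public MachineFunctionPass {
  public:
    static char ID;
    MyLegacyPass() : MachineFunctionPass(ID) {}

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequired<MachineTraceMetricsWrapperPass>();
      AU.addPreserved<MachineTraceMetricsWrapperPass>();
      MachineFunctionPass::getAnalysisUsage(AU);
    }

    bool runOnMachineFunction(MachineFunction &MF) override {
      MachineTraceMetrics &Traces =
          getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
      (void)Traces; // ... query traces exactly as before the port ...
      return false;
    }
  };
  } // end anonymous namespace

  char MyLegacyPass::ID = 0;
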

>From e36b22f3bf45a23d31b569e53d22b98714cf00e3 Mon Sep 17 00:00:00 2001
From: Howard Roark <howard.roark at huawei-partners.com>
Date: Wed, 16 Oct 2024 10:50:48 +0300
Subject: [PATCH 17/82] Revert "[PGO] Preserve analysis results when nothing
 was instrumented (#93421)"

This reverts commit 23c64beeccc03c6a8329314ecd75864e09bb6d97.
---
 .../Instrumentation/PGOInstrumentation.cpp    |  4 +-
 .../PGOInstrumentationTest.cpp                | 68 +++++--------------
 2 files changed, 19 insertions(+), 53 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index dbe908bb5e72f3..e6e474ed376069 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1916,7 +1916,6 @@ static bool InstrumentAllFunctions(
   std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
   collectComdatMembers(M, ComdatMembers);
 
-  bool AnythingInstrumented = false;
   for (auto &F : M) {
     if (skipPGOGen(F))
       continue;
@@ -1926,9 +1925,8 @@ static bool InstrumentAllFunctions(
     FunctionInstrumenter FI(M, F, TLI, ComdatMembers, BPI, BFI,
                             InstrumentationType);
     FI.instrument();
-    AnythingInstrumented = true;
   }
-  return AnythingInstrumented;
+  return true;
 }
 
 PreservedAnalyses
diff --git a/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp b/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp
index a4c076a8752fc3..9ccb13934cbd38 100644
--- a/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp
+++ b/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp
@@ -103,13 +103,9 @@ class MockModuleAnalysisHandle
                ModuleAnalysisManager::Invalidator &));
 };
 
-template <typename ParamType> struct PGOTestName {
-  std::string operator()(const TestParamInfo<ParamType> &Info) const {
-    return std::get<1>(Info.param).str();
-  }
-};
-
-struct PGOInstrumentationGenTest : public Test {
+struct PGOInstrumentationGenTest
+    : public Test,
+      WithParamInterface<std::tuple<StringRef, StringRef>> {
   ModulePassManager MPM;
   PassBuilder PB;
   MockModuleAnalysisHandle MMAHandle;
@@ -145,47 +141,12 @@ struct PGOInstrumentationGenTest : public Test {
   }
 };
 
-struct PGOInstrumentationGenInstrumentTest
-    : PGOInstrumentationGenTest,
-      WithParamInterface<std::tuple<StringRef, StringRef>> {};
-
 static constexpr StringRef CodeWithFuncDefs = R"(
   define i32 @f(i32 %n) {
   entry:
     ret i32 0
   })";
 
-INSTANTIATE_TEST_SUITE_P(
-    PGOInstrumetationGenTestSuite, PGOInstrumentationGenInstrumentTest,
-    Values(std::make_tuple(CodeWithFuncDefs, "instrument_function_defs")),
-    PGOTestName<PGOInstrumentationGenInstrumentTest::ParamType>());
-
-TEST_P(PGOInstrumentationGenInstrumentTest, Instrumented) {
-  const StringRef Code = std::get<0>(GetParam());
-  parseAssembly(Code);
-
-  ASSERT_THAT(M, NotNull());
-
-  Sequence PassSequence;
-  EXPECT_CALL(MMAHandle, run(Ref(*M), _))
-      .InSequence(PassSequence)
-      .WillOnce(DoDefault());
-  EXPECT_CALL(MMAHandle, invalidate(Ref(*M), _, _))
-      .InSequence(PassSequence)
-      .WillOnce(DoDefault());
-
-  MPM.run(*M, MAM);
-
-  const auto *IRInstrVar =
-      M->getNamedGlobal(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR));
-  ASSERT_THAT(IRInstrVar, NotNull());
-  EXPECT_FALSE(IRInstrVar->isDeclaration());
-}
-
-struct PGOInstrumentationGenIgnoreTest
-    : PGOInstrumentationGenTest,
-      WithParamInterface<std::tuple<StringRef, StringRef>> {};
-
 static constexpr StringRef CodeWithFuncDecls = R"(
   declare i32 @f(i32);
 )";
@@ -196,26 +157,33 @@ static constexpr StringRef CodeWithGlobals = R"(
 )";
 
 INSTANTIATE_TEST_SUITE_P(
-    PGOInstrumetationGenIgnoreTestSuite, PGOInstrumentationGenIgnoreTest,
-    Values(std::make_tuple(CodeWithFuncDecls, "instrument_function_decls"),
+    PGOInstrumetationGenTestSuite, PGOInstrumentationGenTest,
+    Values(std::make_tuple(CodeWithFuncDefs, "instrument_function_defs"),
+           std::make_tuple(CodeWithFuncDecls, "instrument_function_decls"),
            std::make_tuple(CodeWithGlobals, "instrument_globals")),
-    PGOTestName<PGOInstrumentationGenIgnoreTest::ParamType>());
+    [](const TestParamInfo<PGOInstrumentationGenTest::ParamType> &Info) {
+      return std::get<1>(Info.param).str();
+    });
 
-TEST_P(PGOInstrumentationGenIgnoreTest, NotInstrumented) {
+TEST_P(PGOInstrumentationGenTest, Instrumented) {
   const StringRef Code = std::get<0>(GetParam());
-
   parseAssembly(Code);
 
   ASSERT_THAT(M, NotNull());
 
-  EXPECT_CALL(MMAHandle, run(Ref(*M), _)).WillOnce(DoDefault());
-  EXPECT_CALL(MMAHandle, invalidate(Ref(*M), _, _)).Times(0);
+  Sequence PassSequence;
+  EXPECT_CALL(MMAHandle, run(Ref(*M), _))
+      .InSequence(PassSequence)
+      .WillOnce(DoDefault());
+  EXPECT_CALL(MMAHandle, invalidate(Ref(*M), _, _))
+      .InSequence(PassSequence)
+      .WillOnce(DoDefault());
 
   MPM.run(*M, MAM);
 
   const auto *IRInstrVar =
       M->getNamedGlobal(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR));
-  ASSERT_THAT(IRInstrVar, NotNull());
+  EXPECT_THAT(IRInstrVar, NotNull());
   EXPECT_FALSE(IRInstrVar->isDeclaration());
 }
 

>From 488d3924dd28d0402a4c32a6386865adc936d368 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <christudasan.devadasan at amd.com>
Date: Wed, 16 Oct 2024 13:22:57 +0530
Subject: [PATCH 18/82] [CodeGen][NewPM] Port EarlyIfConversion pass to NPM.
 (#108508)

---
 llvm/include/llvm/CodeGen/EarlyIfConversion.h | 24 ++++++
 llvm/include/llvm/CodeGen/Passes.h            |  2 +-
 llvm/include/llvm/InitializePasses.h          |  2 +-
 llvm/include/llvm/Passes/CodeGenPassBuilder.h |  1 +
 .../llvm/Passes/MachinePassRegistry.def       |  2 +-
 llvm/lib/CodeGen/CodeGen.cpp                  |  2 +-
 llvm/lib/CodeGen/EarlyIfConversion.cpp        | 79 ++++++++++++++-----
 llvm/lib/CodeGen/TargetPassConfig.cpp         |  4 +-
 llvm/lib/Passes/PassBuilder.cpp               |  1 +
 .../Target/AArch64/AArch64TargetMachine.cpp   |  2 +-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  2 +-
 llvm/lib/Target/PowerPC/PPCTargetMachine.cpp  |  2 +-
 .../Target/SystemZ/SystemZTargetMachine.cpp   |  2 +-
 llvm/lib/Target/X86/X86TargetMachine.cpp      |  2 +-
 .../early-ifcvt-likely-predictable.mir        |  1 +
 .../AArch64/early-ifcvt-regclass-mismatch.mir |  1 +
 .../AArch64/early-ifcvt-same-value.mir        |  1 +
 .../CodeGen/PowerPC/early-ifcvt-no-isel.mir   |  2 +
 18 files changed, 102 insertions(+), 30 deletions(-)
 create mode 100644 llvm/include/llvm/CodeGen/EarlyIfConversion.h

diff --git a/llvm/include/llvm/CodeGen/EarlyIfConversion.h b/llvm/include/llvm/CodeGen/EarlyIfConversion.h
new file mode 100644
index 00000000000000..78bf12ade02c3d
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/EarlyIfConversion.h
@@ -0,0 +1,24 @@
+//===- llvm/CodeGen/EarlyIfConversion.h -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_EARLYIFCONVERSION_H
+#define LLVM_CODEGEN_EARLYIFCONVERSION_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class EarlyIfConverterPass : public PassInfoMixin<EarlyIfConverterPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_EARLYIFCONVERSION_H
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 99421bdf769ffa..bbbf99626098a6 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -273,7 +273,7 @@ namespace llvm {
 
   /// EarlyIfConverter - This pass performs if-conversion on SSA form by
   /// inserting cmov instructions.
-  extern char &EarlyIfConverterID;
+  extern char &EarlyIfConverterLegacyID;
 
   /// EarlyIfPredicator - This pass performs if-conversion on SSA form by
   /// predicating if/else block and insert select at the join point.
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 5ed0ad98a2a72d..1374880b6a716b 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -98,7 +98,7 @@ void initializeDominatorTreeWrapperPassPass(PassRegistry &);
 void initializeDwarfEHPrepareLegacyPassPass(PassRegistry &);
 void initializeEarlyCSELegacyPassPass(PassRegistry &);
 void initializeEarlyCSEMemSSALegacyPassPass(PassRegistry &);
-void initializeEarlyIfConverterPass(PassRegistry &);
+void initializeEarlyIfConverterLegacyPass(PassRegistry &);
 void initializeEarlyIfPredicatorPass(PassRegistry &);
 void initializeEarlyMachineLICMPass(PassRegistry &);
 void initializeEarlyTailDuplicatePass(PassRegistry &);
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 0d45df08cb0ca7..9ef6e39dbb1cdd 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -27,6 +27,7 @@
 #include "llvm/CodeGen/CodeGenPrepare.h"
 #include "llvm/CodeGen/DeadMachineInstructionElim.h"
 #include "llvm/CodeGen/DwarfEHPrepare.h"
+#include "llvm/CodeGen/EarlyIfConversion.h"
 #include "llvm/CodeGen/ExpandLargeDivRem.h"
 #include "llvm/CodeGen/ExpandLargeFpConvert.h"
 #include "llvm/CodeGen/ExpandMemCmp.h"
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 1d7084354455c7..4e44d0312ede47 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -129,6 +129,7 @@ MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis())
 #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
 MACHINE_FUNCTION_PASS("dead-mi-elimination", DeadMachineInstructionElimPass())
+MACHINE_FUNCTION_PASS("early-ifcvt", EarlyIfConverterPass())
 MACHINE_FUNCTION_PASS("early-machinelicm", EarlyMachineLICMPass())
 MACHINE_FUNCTION_PASS("finalize-isel", FinalizeISelPass())
 MACHINE_FUNCTION_PASS("localstackalloc", LocalStackSlotAllocationPass())
@@ -205,7 +206,6 @@ DUMMY_MACHINE_FUNCTION_PASS("cfi-fixup", CFIFixupPass)
 DUMMY_MACHINE_FUNCTION_PASS("cfi-instr-inserter", CFIInstrInserterPass)
 DUMMY_MACHINE_FUNCTION_PASS("detect-dead-lanes", DetectDeadLanesPass)
 DUMMY_MACHINE_FUNCTION_PASS("dot-machine-cfg", MachineCFGPrinter)
-DUMMY_MACHINE_FUNCTION_PASS("early-ifcvt", EarlyIfConverterPass)
 DUMMY_MACHINE_FUNCTION_PASS("early-tailduplication", EarlyTailDuplicatePass)
 DUMMY_MACHINE_FUNCTION_PASS("fentry-insert", FEntryInserterPass)
 DUMMY_MACHINE_FUNCTION_PASS("fixup-statepoint-caller-saved", FixupStatepointCallerSavedPass)
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index 48cc21ee20f0af..2d7f351de54ecb 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -35,7 +35,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeDebugifyMachineModulePass(Registry);
   initializeDetectDeadLanesPass(Registry);
   initializeDwarfEHPrepareLegacyPassPass(Registry);
-  initializeEarlyIfConverterPass(Registry);
+  initializeEarlyIfConverterLegacyPass(Registry);
   initializeEarlyIfPredicatorPass(Registry);
   initializeEarlyMachineLICMPass(Registry);
   initializeEarlyTailDuplicatePass(Registry);
diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp
index 53cf6a5169795f..3e73995846176e 100644
--- a/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -15,6 +15,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/CodeGen/EarlyIfConversion.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -760,7 +761,7 @@ void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock *> &RemoveBlocks,
 //===----------------------------------------------------------------------===//
 
 namespace {
-class EarlyIfConverter : public MachineFunctionPass {
+class EarlyIfConverter {
   const TargetInstrInfo *TII = nullptr;
   const TargetRegisterInfo *TRI = nullptr;
   MCSchedModel SchedModel;
@@ -772,31 +773,41 @@ class EarlyIfConverter : public MachineFunctionPass {
   SSAIfConv IfConv;
 
 public:
-  static char ID;
-  EarlyIfConverter() : MachineFunctionPass(ID) {}
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
-  bool runOnMachineFunction(MachineFunction &MF) override;
-  StringRef getPassName() const override { return "Early If-Conversion"; }
+  EarlyIfConverter(MachineDominatorTree &DT, MachineLoopInfo &LI,
+                   MachineTraceMetrics &MTM)
+      : DomTree(&DT), Loops(&LI), Traces(&MTM) {}
+  EarlyIfConverter() = delete;
+
+  bool run(MachineFunction &MF);
 
 private:
   bool tryConvertIf(MachineBasicBlock *);
   void invalidateTraces();
   bool shouldConvertIf();
 };
+
+class EarlyIfConverterLegacy : public MachineFunctionPass {
+public:
+  static char ID;
+  EarlyIfConverterLegacy() : MachineFunctionPass(ID) {}
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  StringRef getPassName() const override { return "Early If-Conversion"; }
+};
 } // end anonymous namespace
 
-char EarlyIfConverter::ID = 0;
-char &llvm::EarlyIfConverterID = EarlyIfConverter::ID;
+char EarlyIfConverterLegacy::ID = 0;
+char &llvm::EarlyIfConverterLegacyID = EarlyIfConverterLegacy::ID;
 
-INITIALIZE_PASS_BEGIN(EarlyIfConverter, DEBUG_TYPE,
-                      "Early If Converter", false, false)
+INITIALIZE_PASS_BEGIN(EarlyIfConverterLegacy, DEBUG_TYPE, "Early If Converter",
+                      false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass)
-INITIALIZE_PASS_END(EarlyIfConverter, DEBUG_TYPE,
-                    "Early If Converter", false, false)
+INITIALIZE_PASS_END(EarlyIfConverterLegacy, DEBUG_TYPE, "Early If Converter",
+                    false, false)
 
-void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const {
+void EarlyIfConverterLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
   AU.addRequired<MachineDominatorTreeWrapperPass>();
   AU.addPreserved<MachineDominatorTreeWrapperPass>();
@@ -1076,11 +1087,9 @@ bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) {
   return Changed;
 }
 
-bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
+bool EarlyIfConverter::run(MachineFunction &MF) {
   LLVM_DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n"
                     << "********** Function: " << MF.getName() << '\n');
-  if (skipFunction(MF.getFunction()))
-    return false;
 
   // Only run if conversion if the target wants it.
   const TargetSubtargetInfo &STI = MF.getSubtarget();
@@ -1091,9 +1100,6 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
   TRI = STI.getRegisterInfo();
   SchedModel = STI.getSchedModel();
   MRI = &MF.getRegInfo();
-  DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
-  Loops = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
-  Traces = &getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
   MinInstr = nullptr;
 
   bool Changed = false;
@@ -1110,6 +1116,41 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
   return Changed;
 }
 
+PreservedAnalyses
+EarlyIfConverterPass::run(MachineFunction &MF,
+                          MachineFunctionAnalysisManager &MFAM) {
+  if (MF.getFunction().hasOptNone())
+    return PreservedAnalyses::all();
+
+  MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
+  MachineLoopInfo &LI = MFAM.getResult<MachineLoopAnalysis>(MF);
+  MachineTraceMetrics &MTM = MFAM.getResult<MachineTraceMetricsAnalysis>(MF);
+
+  EarlyIfConverter Impl(MDT, LI, MTM);
+  bool Changed = Impl.run(MF);
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  auto PA = getMachineFunctionPassPreservedAnalyses();
+  PA.preserve<MachineDominatorTreeAnalysis>();
+  PA.preserve<MachineLoopAnalysis>();
+  PA.preserve<MachineTraceMetricsAnalysis>();
+  return PA;
+}
+
+bool EarlyIfConverterLegacy::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  MachineDominatorTree &MDT =
+      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+  MachineLoopInfo &LI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+  MachineTraceMetrics &MTM =
+      getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
+
+  return EarlyIfConverter(MDT, LI, MTM).run(MF);
+}
+
 //===----------------------------------------------------------------------===//
 //                           EarlyIfPredicator Pass
 //===----------------------------------------------------------------------===//
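
The restructuring above is the common recipe for porting a transform pass: the algorithm moves into a plain class with a run(MachineFunction &) entry point, the legacy MachineFunctionPass becomes a thin driver that feeds it analyses via getAnalysis<>, and the new-PM pass pulls the same analyses from the MachineFunctionAnalysisManager and converts the changed/unchanged result into PreservedAnalyses (note also that the optnone handling moves from skipFunction() in the legacy driver to an explicit hasOptNone() test in the new one). A condensed sketch with placeholder class names:

  #include "llvm/CodeGen/MachineDominators.h"
  #include "llvm/CodeGen/MachineLoopInfo.h"
  #include "llvm/CodeGen/MachinePassManager.h"

  using namespace llvm;

  namespace {
  // The shared implementation; both pass flavours delegate to it.
  class IfConvImpl {
    MachineDominatorTree &DomTree;
    MachineLoopInfo &Loops;

  public:
    IfConvImpl(MachineDominatorTree &DT, MachineLoopInfo &LI)
        : DomTree(DT), Loops(LI) {}
    bool run(MachineFunction &MF) { /* ... the actual transform ... */ return false; }
  };

  // New-PM driver: fetch analyses, run the impl, report what is preserved.
  class MyIfConvPass : public PassInfoMixin<MyIfConvPass> {
  public:
    PreservedAnalyses run(MachineFunction &MF,
                          MachineFunctionAnalysisManager &MFAM) {
      IfConvImpl Impl(MFAM.getResult<MachineDominatorTreeAnalysis>(MF),
                      MFAM.getResult<MachineLoopAnalysis>(MF));
      if (!Impl.run(MF))
        return PreservedAnalyses::all();
      auto PA = getMachineFunctionPassPreservedAnalyses();
      PA.preserve<MachineDominatorTreeAnalysis>();
      PA.preserve<MachineLoopAnalysis>();
      return PA;
    }
  };
  } // end anonymous namespace
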
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index cf9d63df2515cc..02c3a852697580 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -305,7 +305,7 @@ static IdentifyingPassPtr overridePass(AnalysisID StandardID,
   if (StandardID == &DeadMachineInstructionElimID)
     return applyDisable(TargetID, DisableMachineDCE);
 
-  if (StandardID == &EarlyIfConverterID)
+  if (StandardID == &EarlyIfConverterLegacyID)
     return applyDisable(TargetID, DisableEarlyIfConversion);
 
   if (StandardID == &EarlyMachineLICMID)
@@ -521,7 +521,7 @@ void llvm::registerCodeGenCallback(PassInstrumentationCallbacks &PIC,
     DISABLE_PASS(DisableBlockPlacement, MachineBlockPlacementPass)
     DISABLE_PASS(DisableBranchFold, BranchFolderPass)
     DISABLE_PASS(DisableCopyProp, MachineCopyPropagationPass)
-    DISABLE_PASS(DisableEarlyIfConversion, EarlyIfConverterPass)
+    DISABLE_PASS(DisableEarlyIfConversion, EarlyIfConverterLegacyPass)
     DISABLE_PASS(DisableEarlyTailDup, EarlyTailDuplicatePass)
     DISABLE_PASS(DisableMachineCSE, MachineCSELegacyPass)
     DISABLE_PASS(DisableMachineDCE, DeadMachineInstructionElimPass)
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 08cc4ddbe34327..ebad3507eb5e29 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -82,6 +82,7 @@
 #include "llvm/CodeGen/CodeGenPrepare.h"
 #include "llvm/CodeGen/DeadMachineInstructionElim.h"
 #include "llvm/CodeGen/DwarfEHPrepare.h"
+#include "llvm/CodeGen/EarlyIfConversion.h"
 #include "llvm/CodeGen/ExpandLargeDivRem.h"
 #include "llvm/CodeGen/ExpandLargeFpConvert.h"
 #include "llvm/CodeGen/ExpandMemCmp.h"
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 21b86f5fe5d912..c7bd0390b65620 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -784,7 +784,7 @@ bool AArch64PassConfig::addILPOpts() {
   if (EnableCondBrTuning)
     addPass(createAArch64CondBrTuning());
   if (EnableEarlyIfConversion)
-    addPass(&EarlyIfConverterID);
+    addPass(&EarlyIfConverterLegacyID);
   if (EnableStPairSuppress)
     addPass(createAArch64StorePairSuppressPass());
   addPass(createAArch64SIMDInstrOptPass());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 23ee0c3e896eb3..7cc65c3996fac3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1335,7 +1335,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
 
 bool GCNPassConfig::addILPOpts() {
   if (EnableEarlyIfConversion)
-    addPass(&EarlyIfConverterID);
+    addPass(&EarlyIfConverterLegacyID);
 
   TargetPassConfig::addILPOpts();
   return false;
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 7d0455942923dd..cd188304595e18 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -521,7 +521,7 @@ bool PPCPassConfig::addPreISel() {
 }
 
 bool PPCPassConfig::addILPOpts() {
-  addPass(&EarlyIfConverterID);
+  addPass(&EarlyIfConverterLegacyID);
 
   if (EnableMachineCombinerPass)
     addPass(&MachineCombinerID);
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 53ed46f14f14dc..f76f41768e886d 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -257,7 +257,7 @@ bool SystemZPassConfig::addInstSelector() {
 }
 
 bool SystemZPassConfig::addILPOpts() {
-  addPass(&EarlyIfConverterID);
+  addPass(&EarlyIfConverterLegacyID);
 
   if (EnableMachineCombinerPass)
     addPass(&MachineCombinerID);
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index ceb87a62dba011..4ba0ac11d20953 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -536,7 +536,7 @@ bool X86PassConfig::addGlobalInstructionSelect() {
 }
 
 bool X86PassConfig::addILPOpts() {
-  addPass(&EarlyIfConverterID);
+  addPass(&EarlyIfConverterLegacyID);
   if (EnableMachineCombinerPass)
     addPass(&MachineCombinerID);
   addPass(createX86CmovConverterPass());
diff --git a/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir b/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir
index 425a23214871d6..ab5e320725d5bd 100644
--- a/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir
+++ b/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=arm64-apple-ios -mcpu=apple-m1 -run-pass=early-ifcvt -o - %s | FileCheck %s
+# RUN: llc -mtriple=arm64-apple-ios -mcpu=apple-m1 -passes=early-ifcvt -o - %s | FileCheck %s
 
 --- |
   define void @test_cond_is_load_with_invariant_ops() {
diff --git a/llvm/test/CodeGen/AArch64/early-ifcvt-regclass-mismatch.mir b/llvm/test/CodeGen/AArch64/early-ifcvt-regclass-mismatch.mir
index 318bdceeaef41d..a7f67f8b682c3c 100644
--- a/llvm/test/CodeGen/AArch64/early-ifcvt-regclass-mismatch.mir
+++ b/llvm/test/CodeGen/AArch64/early-ifcvt-regclass-mismatch.mir
@@ -1,4 +1,5 @@
 # RUN: llc -mtriple=aarch64-unknown-unknown -run-pass=early-ifcvt -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64-unknown-unknown -passes=early-ifcvt -verify-each %s -o - | FileCheck %s
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
   target triple = "arm64-apple-ios13.3.0"
diff --git a/llvm/test/CodeGen/AArch64/early-ifcvt-same-value.mir b/llvm/test/CodeGen/AArch64/early-ifcvt-same-value.mir
index b9298608e192fc..16d5dfc78f564b 100644
--- a/llvm/test/CodeGen/AArch64/early-ifcvt-same-value.mir
+++ b/llvm/test/CodeGen/AArch64/early-ifcvt-same-value.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=aarch64-- -run-pass=early-ifcvt -stress-early-ifcvt -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64-- -passes=early-ifcvt -stress-early-ifcvt %s -o - | FileCheck %s
 
 ---
 name:            fmov0
diff --git a/llvm/test/CodeGen/PowerPC/early-ifcvt-no-isel.mir b/llvm/test/CodeGen/PowerPC/early-ifcvt-no-isel.mir
index 99a3f80ff81b4d..794480bfc6cec1 100644
--- a/llvm/test/CodeGen/PowerPC/early-ifcvt-no-isel.mir
+++ b/llvm/test/CodeGen/PowerPC/early-ifcvt-no-isel.mir
@@ -1,6 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
 # RUN: llc -mtriple=powerpc64-ibm-aix -mcpu=pwr7 -simplify-mir -verify-machineinstrs \
 # RUN:   -run-pass=early-ifcvt %s -o - | FileCheck %s
+# RUN: llc -mtriple=powerpc64-ibm-aix -mcpu=pwr7 -simplify-mir -verify-each \
+# RUN:   -passes=early-ifcvt %s -o - | FileCheck %s
 
 --- |
   source_filename = "<stdin>"

>From c137b3ee357b6e7564f6717bcfb56e28044fc583 Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me at antoniofrighetto.com>
Date: Wed, 16 Oct 2024 09:47:58 +0200
Subject: [PATCH 19/82] [X86] Introduce test for PR112098 (NFC)

---
 .../test/CodeGen/X86/tailcall-caller-nocsr.ll | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll

diff --git a/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll b/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll
new file mode 100644
index 00000000000000..5606fbb27032fb
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=-sse,-avx | FileCheck %s
+
+ at .str = private unnamed_addr constant [6 x i8] c"%d %d\00", align 1
+
+define void @caller(i32 %0, i32 %1) #0 {
+; CHECK-LABEL: caller:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %r11
+; CHECK-NEXT:    pushq %r10
+; CHECK-NEXT:    pushq %r9
+; CHECK-NEXT:    pushq %r8
+; CHECK-NEXT:    pushq %rdx
+; CHECK-NEXT:    pushq %rcx
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    movl %edi, %esi
+; CHECK-NEXT:    movl $.L.str, %edi
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    popq %rdx
+; CHECK-NEXT:    popq %r8
+; CHECK-NEXT:    popq %r9
+; CHECK-NEXT:    popq %r10
+; CHECK-NEXT:    popq %r11
+; CHECK-NEXT:    jmp printf at PLT # TAILCALL
+  %3 = tail call i32 @printf(ptr @.str, i32 %0, i32 %1)
+  ret void
+}
+
+declare i32 @printf(ptr, ...) nounwind
+
+attributes #0 = { mustprogress nounwind "no_caller_saved_registers" }

>From d3a8363beccf4a45f222c63b20ba94e8d450c8db Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me at antoniofrighetto.com>
Date: Wed, 16 Oct 2024 09:48:32 +0200
Subject: [PATCH 20/82] [X86] Do not elect to tail call if caller must preserve
 all registers

A miscompilation has been addressed: tail-call optimization is now
rejected when the caller carries the "no_caller_saved_registers"
attribute, since replacing the caller's stack frame with the callee's
would drop the register save/restore the attribute requires.

Fixes: https://github.com/llvm/llvm-project/issues/97758.
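For illustration only (not part of this patch), a minimal C++ sketch of the
pattern being fixed, using clang's x86 "no_caller_saved_registers" attribute;
the function names here are hypothetical:

  #include <cstdio>

  // The caller must preserve all registers. With this change the call below
  // is emitted as call + ret rather than a tail jump, so the caller's
  // register restore sequence still runs after printf returns.
  __attribute__((no_caller_saved_registers)) void caller(int a, int b) {
    std::printf("%d %d", a, b);
  }

This mirrors the IR test updated below (tailcall-caller-nocsr.ll).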
---
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp    | 7 +++++++
 llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll | 4 +++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 8561658379f7e4..12cd92e2d0d773 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2856,6 +2856,13 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
       return false;
   }
 
+  // The stack frame of the caller cannot be replaced by the tail-callee one's
+  // if the function is required to preserve all the registers. Conservatively
+  // prevent tail optimization even if hypothetically all the registers are used
+  // for passing formal parameters or returning values.
+  if (CallerF.hasFnAttribute("no_caller_saved_registers"))
+    return false;
+
   unsigned StackArgsSize = CCInfo.getStackSize();
 
   // If the callee takes no arguments then go on to check the results of the
diff --git a/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll b/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll
index 5606fbb27032fb..0385017a1ced73 100644
--- a/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll
+++ b/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll
@@ -13,8 +13,10 @@ define void @caller(i32 %0, i32 %1) #0 {
 ; CHECK-NEXT:    pushq %rdx
 ; CHECK-NEXT:    pushq %rcx
 ; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    movl %esi, %edx
 ; CHECK-NEXT:    movl %edi, %esi
 ; CHECK-NEXT:    movl $.L.str, %edi
+; CHECK-NEXT:    callq printf at PLT
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    popq %rdx
@@ -22,7 +24,7 @@ define void @caller(i32 %0, i32 %1) #0 {
 ; CHECK-NEXT:    popq %r9
 ; CHECK-NEXT:    popq %r10
 ; CHECK-NEXT:    popq %r11
-; CHECK-NEXT:    jmp printf at PLT # TAILCALL
+; CHECK-NEXT:    retq
   %3 = tail call i32 @printf(ptr @.str, i32 %0, i32 %1)
   ret void
 }

>From 72a7b471de9ad6d9bf6540f02a10774c0b246e2e Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <christudasan.devadasan at amd.com>
Date: Wed, 16 Oct 2024 13:30:46 +0530
Subject: [PATCH 21/82] [AMDGPU][NewPM] Fill out addILPOpts. (#108514)

---
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 +++++++
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h   | 1 +
 2 files changed, 8 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 7cc65c3996fac3..e4cc522194f2a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1983,6 +1983,13 @@ void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
   addPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>());
 }
 
+void AMDGPUCodeGenPassBuilder::addILPOpts(AddMachinePass &addPass) const {
+  if (EnableEarlyIfConversion)
+    addPass(EarlyIfConverterPass());
+
+  Base::addILPOpts(addPass);
+}
+
 void AMDGPUCodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass,
                                              CreateMCStreamer) const {
   // TODO: Add AsmPrinter.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index af8476bc21ec61..d8a5111e5898d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -172,6 +172,7 @@ class AMDGPUCodeGenPassBuilder
   void addIRPasses(AddIRPass &) const;
   void addCodeGenPrepare(AddIRPass &) const;
   void addPreISel(AddIRPass &addPass) const;
+  void addILPOpts(AddMachinePass &) const;
   void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const;
   Error addInstSelector(AddMachinePass &) const;
   void addMachineSSAOptimization(AddMachinePass &) const;

>From 4ddea298e60c31d0995c06189a592895d2ad512b Mon Sep 17 00:00:00 2001
From: Hans <hans at hanshq.net>
Date: Wed, 16 Oct 2024 10:06:43 +0200
Subject: [PATCH 22/82] [clang-cl]: Add /std:c++23preview and update _MSVC_LANG
 for C++23 (#112378)

As discussed in
https://discourse.llvm.org/t/clang-cl-adding-std-c-23preview/82553
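For illustration only (not part of this patch), a hedged C++ sketch of how the
new mode can be observed; exact macro availability depends on the MSVC
compatibility version in effect:

  // Compile with: clang-cl /TP /std:c++23preview example.cpp
  #if defined(_MSVC_LANG) && _MSVC_LANG >= 202302L
  // _MSVC_LANG is 202302L in this mode, per the change below.
  static_assert(true, "C++23 (preview) language mode is active");
  #endif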
---
 clang/docs/ReleaseNotes.rst                     | 2 ++
 clang/include/clang/Driver/Options.td           | 2 +-
 clang/lib/Basic/Targets/OSTargets.cpp           | 6 ++++--
 clang/lib/Driver/ToolChains/Clang.cpp           | 1 +
 clang/test/Driver/cl-options.c                  | 3 +++
 clang/test/Preprocessor/predefined-win-macros.c | 7 ++++++-
 6 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 817e3abef8d566..33eb9a2b5804a0 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -605,6 +605,8 @@ Android Support
 Windows Support
 ^^^^^^^^^^^^^^^
 
+- clang-cl now supports ``/std:c++23preview`` which enables C++23 features.
+
 - Clang no longer allows references inside a union when emulating MSVC 1900+ even if `fms-extensions` is enabled.
   Starting with VS2015, MSVC 1900, this Microsoft extension is no longer allowed and always results in an error.
   Clang now follows the MSVC behavior in this scenario.
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 2072ae45d55414..379e75b197cf96 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -8531,7 +8531,7 @@ def _SLASH_execution_charset : CLCompileJoined<"execution-charset:">,
   HelpText<"Set runtime encoding, supports only UTF-8">,
   Alias<fexec_charset_EQ>;
 def _SLASH_std : CLCompileJoined<"std:">,
-  HelpText<"Set language version (c++14,c++17,c++20,c++latest,c11,c17)">;
+  HelpText<"Set language version (c++14,c++17,c++20,c++23preview,c++latest,c11,c17)">;
 def _SLASH_U : CLJoinedOrSeparate<"U">, HelpText<"Undefine macro">,
   MetaVarName<"<macro>">, Alias<U>;
 def _SLASH_validate_charset : CLFlag<"validate-charset">,
diff --git a/clang/lib/Basic/Targets/OSTargets.cpp b/clang/lib/Basic/Targets/OSTargets.cpp
index b56e2c7ca9c494..88c054150ab224 100644
--- a/clang/lib/Basic/Targets/OSTargets.cpp
+++ b/clang/lib/Basic/Targets/OSTargets.cpp
@@ -214,9 +214,11 @@ static void addVisualCDefines(const LangOptions &Opts, MacroBuilder &Builder) {
       Builder.defineMacro("_HAS_CHAR16_T_LANGUAGE_SUPPORT", Twine(1));
 
     if (Opts.isCompatibleWithMSVC(LangOptions::MSVC2015)) {
-      if (Opts.CPlusPlus23)
+      if (Opts.CPlusPlus26)
         // TODO update to the proper value.
-        Builder.defineMacro("_MSVC_LANG", "202004L");
+        Builder.defineMacro("_MSVC_LANG", "202400L");
+      else if (Opts.CPlusPlus23)
+        Builder.defineMacro("_MSVC_LANG", "202302L");
       else if (Opts.CPlusPlus20)
         Builder.defineMacro("_MSVC_LANG", "202002L");
       else if (Opts.CPlusPlus17)
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index c132fa35098ae4..3fc39296f44281 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7225,6 +7225,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                              .Case("c++17", "-std=c++17")
                              .Case("c++20", "-std=c++20")
                              // TODO add c++23 and c++26 when MSVC supports it.
+                             .Case("c++23preview", "-std=c++23")
                              .Case("c++latest", "-std=c++26")
                              .Default("");
       if (LanguageStandard.empty())
diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c
index 48d281bcd447e7..8191fda97788c1 100644
--- a/clang/test/Driver/cl-options.c
+++ b/clang/test/Driver/cl-options.c
@@ -605,6 +605,9 @@
 // RUN: %clang_cl -fmsc-version=1900 -TP -std:c++20 -### -- %s 2>&1 | FileCheck -check-prefix=STDCXX20 %s
 // STDCXX20: -std=c++20
 
+// RUN: %clang_cl -fmsc-version=1900 -TP -std:c++23preview -### -- %s 2>&1 | FileCheck -check-prefix=STDCXX23PREVIEW %s
+// STDCXX23PREVIEW: -std=c++23
+
 // RUN: %clang_cl -fmsc-version=1900 -TP -std:c++latest -### -- %s 2>&1 | FileCheck -check-prefix=STDCXXLATEST %s
 // STDCXXLATEST: -std=c++26
 
diff --git a/clang/test/Preprocessor/predefined-win-macros.c b/clang/test/Preprocessor/predefined-win-macros.c
index 7d29e45c7d5ac6..8e539a2a1faf83 100644
--- a/clang/test/Preprocessor/predefined-win-macros.c
+++ b/clang/test/Preprocessor/predefined-win-macros.c
@@ -56,7 +56,12 @@
 // RUN: %clang_cc1 %s -x c++ -E -dM -triple i686-pc-win32 -fms-extensions -fms-compatibility \
 // RUN:     -fms-compatibility-version=19.00 -std=c++23 -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS-CPP2B
 // CHECK-MS-CPP2B: #define _MSC_VER 1900
-// CHECK-MS-CPP2B: #define _MSVC_LANG 202004L
+// CHECK-MS-CPP2B: #define _MSVC_LANG 202302L
+
+// RUN: %clang_cc1 %s -x c++ -E -dM -triple i686-pc-win32 -fms-extensions -fms-compatibility \
+// RUN:     -fms-compatibility-version=19.00 -std=c++26 -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS-CPP2C
+// CHECK-MS-CPP2C: #define _MSC_VER 1900
+// CHECK-MS-CPP2C: #define _MSVC_LANG 202400L
 
 // RUN: %clang_cc1 -triple i386-windows %s -E -dM -o - \
 // RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-X86-WIN

>From 9df8d8d05c2650b51bd4233e1759206d163f3133 Mon Sep 17 00:00:00 2001
From: Balázs Kéri <balazs.keri at ericsson.com>
Date: Wed, 16 Oct 2024 10:17:34 +0200
Subject: [PATCH 23/82] [clang][analyzer] Improve test and documentation in
 cstring NotNullTerminated checker (#112019)

CStringChecker has a sub-checker, alpha.unix.cstring.NotNullTerminated,
which checks for invalid objects passed to string functions. The checker
and its name are not exact, and more functions could be checked; this
change only adds some tests and improves the documentation.
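For illustration only (not part of this patch), a hedged sketch in the spirit
of the tests added below, showing the kind of argument the checker flags
(clang accepts the function-pointer-to-object-pointer cast used here):

  #include <string.h>

  // A function address is clearly not a null-terminated string, so the
  // checker warns whether it appears as the source or the destination.
  void not_a_string(char *buf) {
    strcpy(buf, (const char *)&not_a_string); // warn
    strcat((char *)&not_a_string, buf);       // warn
  }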
---
 clang/docs/analyzer/checkers.rst | 17 +++++++++---
 clang/test/Analysis/string.c     | 46 +++++++++++++++++++++++++++-----
 clang/test/Analysis/string.cpp   | 10 ++++++-
 3 files changed, 63 insertions(+), 10 deletions(-)

diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index 81264428c72ed1..58dbd686a6dc9f 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -3371,12 +3371,23 @@ Checks for overlap in two buffer arguments. Applies to:  ``memcpy, mempcpy, wmem
 
 alpha.unix.cstring.NotNullTerminated (C)
 """"""""""""""""""""""""""""""""""""""""
-Check for arguments which are not null-terminated strings; applies to: ``strlen, strnlen, strcpy, strncpy, strcat, strncat, wcslen, wcsnlen``.
+Check for arguments which are not null-terminated strings;
+applies to the ``strlen``, ``strcpy``, ``strcat``, ``strcmp`` family of functions.
+
+Only very fundamental cases are detected where the passed memory block is
+absolutely different from a null-terminated string. This checker does not
+find if a memory buffer is passed where the terminating zero character
+is missing.
 
 .. code-block:: c
 
- void test() {
-   int y = strlen((char *)&test); // warn
+ void test1() {
+   int l = strlen((char *)&test); // warn
+ }
+
+ void test2() {
+ label:
+   int l = strlen((char *)&&label); // warn
  }
 
 .. _alpha-unix-cstring-OutOfBounds:
diff --git a/clang/test/Analysis/string.c b/clang/test/Analysis/string.c
index 79b4877eedbd9c..2e0a49d083b0b0 100644
--- a/clang/test/Analysis/string.c
+++ b/clang/test/Analysis/string.c
@@ -361,6 +361,10 @@ void strcpy_fn_const(char *x) {
   strcpy(x, (const char*)&strcpy_fn); // expected-warning{{Argument to string copy function is the address of the function 'strcpy_fn', which is not a null-terminated string}}
 }
 
+void strcpy_fn_dst(const char *x) {
+  strcpy((char*)&strcpy_fn, x); // expected-warning{{Argument to string copy function is the address of the function 'strcpy_fn', which is not a null-terminated string}}
+}
+
 extern int globalInt;
 void strcpy_effects(char *x, char *y) {
   char a = x[0];
@@ -469,8 +473,22 @@ void strcat_null_src(char *x) {
   strcat(x, NULL); // expected-warning{{Null pointer passed as 2nd argument to string concatenation function}}
 }
 
-void strcat_fn(char *x) {
-  strcat(x, (char*)&strcat_fn); // expected-warning{{Argument to string concatenation function is the address of the function 'strcat_fn', which is not a null-terminated string}}
+void strcat_fn_dst(const char *x) {
+  strcat((char*)&strcat_fn_dst, x); // expected-warning{{Argument to string concatenation function is the address of the function 'strcat_fn_dst', which is not a null-terminated string}}
+}
+
+void strcat_fn_src(char *x) {
+  strcat(x, (char*)&strcat_fn_src); // expected-warning{{Argument to string concatenation function is the address of the function 'strcat_fn_src', which is not a null-terminated string}}
+}
+
+void strcat_label_dst(const char *x) {
+label:
+  strcat((char*)&&label, x); // expected-warning{{Argument to string concatenation function is the address of the label 'label', which is not a null-terminated string}}
+}
+
+void strcat_label_src(char *x) {
+label:
+  strcat(x, (char*)&&label); // expected-warning{{Argument to string concatenation function is the address of the label 'label', which is not a null-terminated string}}
 }
 
 void strcat_effects(char *y) {
@@ -568,8 +586,12 @@ void strncpy_null_src(char *x) {
   strncpy(x, NULL, 5); // expected-warning{{Null pointer passed as 2nd argument to string copy function}}
 }
 
-void strncpy_fn(char *x) {
-  strncpy(x, (char*)&strcpy_fn, 5); // expected-warning{{Argument to string copy function is the address of the function 'strcpy_fn', which is not a null-terminated string}}
+void strncpy_fn_src(char *x) {
+  strncpy(x, (char*)&strncpy_fn_src, 5); // expected-warning{{Argument to string copy function is the address of the function 'strncpy_fn_src', which is not a null-terminated string}}
+}
+
+void strncpy_fn_dst(const char *x) {
+  strncpy((char*)&strncpy_fn_dst, x, 5); // expected-warning{{Argument to string copy function is the address of the function 'strncpy_fn_dst', which is not a null-terminated string}}
 }
 
 void strncpy_effects(char *x, char *y) {
@@ -680,8 +702,12 @@ void strncat_null_src(char *x) {
   strncat(x, NULL, 4); // expected-warning{{Null pointer passed as 2nd argument to string concatenation function}}
 }
 
-void strncat_fn(char *x) {
-  strncat(x, (char*)&strncat_fn, 4); // expected-warning{{Argument to string concatenation function is the address of the function 'strncat_fn', which is not a null-terminated string}}
+void strncat_fn_src(char *x) {
+  strncat(x, (char*)&strncat_fn_src, 4); // expected-warning{{Argument to string concatenation function is the address of the function 'strncat_fn_src', which is not a null-terminated string}}
+}
+
+void strncat_fn_dst(const char *x) {
+  strncat((char*)&strncat_fn_dst, x, 4); // expected-warning{{Argument to string concatenation function is the address of the function 'strncat_fn_dst', which is not a null-terminated string}}
 }
 
 void strncat_effects(char *y) {
@@ -921,6 +947,14 @@ int strcmp_null_argument(char *a) {
   return strcmp(a, b); // expected-warning{{Null pointer passed as 2nd argument to string comparison function}}
 }
 
+void strcmp_fn_r(char *x) {
+  strcmp(x, (char*)&strcmp_null_argument); // expected-warning{{Argument to string comparison function is the address of the function 'strcmp_null_argument', which is not a null-terminated string}}
+}
+
+void strcmp_fn_l(char *x) {
+  strcmp((char*)&strcmp_null_argument, x); // expected-warning{{Argument to string comparison function is the address of the function 'strcmp_null_argument', which is not a null-terminated string}}
+}
+
 //===----------------------------------------------------------------------===
 // strncmp()
 //===----------------------------------------------------------------------===
diff --git a/clang/test/Analysis/string.cpp b/clang/test/Analysis/string.cpp
index 1be6c21466cc03..c09422d1922369 100644
--- a/clang/test/Analysis/string.cpp
+++ b/clang/test/Analysis/string.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix,debug.ExprInspection -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix,alpha.unix.cstring,debug.ExprInspection -verify %s
 
 // Test functions that are called "memcpy" but aren't the memcpy
 // we're looking for. Unfortunately, this test cannot be put into
@@ -6,6 +6,7 @@
 // as a normal C function for the test to make sense.
 typedef __typeof(sizeof(int)) size_t;
 void *memcpy(void *, const void *, size_t);
+size_t strlen(const char *s);
 
 int sprintf(char *str, const char *format, ...);
 int snprintf(char *str, size_t size, const char *format, ...);
@@ -45,3 +46,10 @@ void log(const char* fmt, const Args&... args) {
 void test_gh_74269_no_crash() {
   log("%d", 1);
 }
+
+struct TestNotNullTerm {
+  void test1() {
+    TestNotNullTerm * const &x = this;
+    strlen((char *)&x); // expected-warning{{Argument to string length function is not a null-terminated string}}
+  }
+};

>From 3bf2295ee0ebd1eafe66ca15dff44bdb31e6198a Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Wed, 16 Oct 2024 16:24:44 +0800
Subject: [PATCH 24/82] [InstCombine] Drop `samesign` flag in
 `foldAndOrOfICmpsWithConstEq` (#112489)

In
https://github.com/llvm/llvm-project/commit/5dbfca30c1a672cd0c5089df2b4fdd171436643a
we assume that RHS being poison implies that LHS is also poison. This no
longer holds after the introduction of the samesign flag.

This patch drops the `samesign` flag on RHS if the original expression
is a logical and/or.

Closes #112467.
---
 .../Transforms/InstCombine/InstCombineAndOrXor.cpp  |  8 +++++++-
 .../Transforms/InstCombine/and-or-icmp-min-max.ll   | 11 +++++++++++
 .../Transforms/InstCombine/and-or-icmp-nullptr.ll   | 13 +++++++++++++
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 64bee4ab974ede..c8407e8ba5aba8 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -3369,8 +3369,14 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
   // We can convert this case to bitwise and, because both operands are used
   // on the LHS, and as such poison from both will propagate.
   if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, IsAnd,
-                                             /*IsLogical*/ false, Builder, Q))
+                                             /*IsLogical=*/false, Builder, Q)) {
+    // If RHS is still used, we should drop samesign flag.
+    if (IsLogical && RHS->hasSameSign() && !RHS->use_empty()) {
+      RHS->setSameSign(false);
+      addToWorklist(RHS);
+    }
     return V;
+  }
 
   if (Value *V = foldIsPowerOf2OrZero(LHS, RHS, IsAnd, Builder, *this))
     return V;
diff --git a/llvm/test/Transforms/InstCombine/and-or-icmp-min-max.ll b/llvm/test/Transforms/InstCombine/and-or-icmp-min-max.ll
index 058847a75bde7f..cc55c4a39a2678 100644
--- a/llvm/test/Transforms/InstCombine/and-or-icmp-min-max.ll
+++ b/llvm/test/Transforms/InstCombine/and-or-icmp-min-max.ll
@@ -689,6 +689,17 @@ define i1 @sge_and_max_logical(i8 %x, i8 %y)  {
   ret i1 %r
 }
 
+define i1 @sge_and_max_logical_samesign(i8 %x, i8 %y)  {
+; CHECK-LABEL: @sge_and_max_logical_samesign(
+; CHECK-NEXT:    [[CMPEQ:%.*]] = icmp eq i8 [[X:%.*]], 127
+; CHECK-NEXT:    ret i1 [[CMPEQ]]
+;
+  %cmp = icmp sge i8 %x, %y
+  %cmpeq = icmp samesign eq i8 %x, 127
+  %r = select i1 %cmp, i1 %cmpeq, i1 false
+  ret i1 %r
+}
+
 define i1 @sge_and_max_commute(i8 %x, i8 %y)  {
 ; CHECK-LABEL: @sge_and_max_commute(
 ; CHECK-NEXT:    [[CMPEQ:%.*]] = icmp eq i8 [[X:%.*]], 127
diff --git a/llvm/test/Transforms/InstCombine/and-or-icmp-nullptr.ll b/llvm/test/Transforms/InstCombine/and-or-icmp-nullptr.ll
index d533cc7048536c..8650b89c3b9ef3 100644
--- a/llvm/test/Transforms/InstCombine/and-or-icmp-nullptr.ll
+++ b/llvm/test/Transforms/InstCombine/and-or-icmp-nullptr.ll
@@ -592,6 +592,19 @@ define i1 @sgt_and_min_logical(ptr %x, ptr %y)  {
   ret i1 %r
 }
 
+define i1 @sgt_and_min_logical_samesign(ptr %x, ptr %y)  {
+; CHECK-LABEL: @sgt_and_min_logical_samesign(
+; CHECK-NEXT:    [[CMPEQ:%.*]] = icmp eq ptr [[X:%.*]], null
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt ptr [[Y:%.*]], null
+; CHECK-NEXT:    [[R:%.*]] = and i1 [[CMPEQ]], [[TMP1]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp sgt ptr %x, %y
+  %cmpeq = icmp samesign eq ptr %x, null
+  %r = select i1 %cmp, i1 %cmpeq, i1 false
+  ret i1 %r
+}
+
 define i1 @sle_or_not_min(ptr %x, ptr %y)  {
 ; CHECK-LABEL: @sle_or_not_min(
 ; CHECK-NEXT:    [[CMPEQ:%.*]] = icmp ne ptr [[X:%.*]], null

>From f0d7cccaf5ad372c1039d0699e36771ac50fa5c4 Mon Sep 17 00:00:00 2001
From: Ilya Biryukov <ibiryukov at google.com>
Date: Wed, 16 Oct 2024 10:38:53 +0200
Subject: [PATCH 25/82] [Modules][NFC] Rewrite friend-definition-2.cpp with
 split-file (#112380)

Use split-file instead of the pragmas, which are less familiar to people.
This is a follow-up to the discussion in #111992.
---
 clang/test/Modules/friend-definition-2.cpp | 59 +++++++++++++++-------
 1 file changed, 40 insertions(+), 19 deletions(-)

diff --git a/clang/test/Modules/friend-definition-2.cpp b/clang/test/Modules/friend-definition-2.cpp
index 41c2141f401392..d91ce14722b45f 100644
--- a/clang/test/Modules/friend-definition-2.cpp
+++ b/clang/test/Modules/friend-definition-2.cpp
@@ -1,32 +1,53 @@
-// RUN: %clang_cc1 -std=c++14 -fmodules %s -verify
-// RUN: %clang_cc1 -std=c++14 -fmodules %s -verify -triple i686-windows
-// expected-no-diagnostics
-#pragma clang module build A
-module A {}
-#pragma clang module contents
-#pragma clang module begin A
+// RUN: split-file %s %t
+
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-name=A -emit-module %t/a.modulemap -o %t/a.pcm
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-name=B -emit-module %t/b.modulemap -o %t/b.pcm
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-map-file=%t/a.modulemap -fmodule-map-file=%t/b.modulemap \
+// RUN:   -fmodule-file=%t/a.pcm -fmodule-file=%t/b.pcm \
+// RUN:   %t/use.cc -verify
+
+// RUN: rm -f %t/*.pcm
+
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-name=A -emit-module %t/a.modulemap -o %t/a.pcm -triple i686-windows
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-name=B -emit-module %t/b.modulemap -o %t/b.pcm -triple i686-windows
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-map-file=%t/a.modulemap -fmodule-map-file=%t/b.modulemap \
+// RUN:   -fmodule-file=%t/a.pcm -fmodule-file=%t/b.pcm \
+// RUN:   %t/use.cc -verify -triple i686-windows
+
+//--- a.modulemap
+module A {
+  header "a.h"
+}
+
+//--- a.h
+#ifndef A_H
+#define A_H
+template<typename T> struct ct { friend auto operator-(ct, ct) { struct X {}; return X(); } void x(); };
+#endif
+
+//--- b.modulemap
+module B {
+  header "b.h"
+}
+
+//--- b.h
+#ifndef B_H
+#define B_H
 template<typename T> struct ct { friend auto operator-(ct, ct) { struct X {}; return X(); } void x(); };
-#pragma clang module end
-#pragma clang module endbuild
-
-#pragma clang module build B
-module B {}
-#pragma clang module contents
-#pragma clang module begin B
-template<typename T> struct ct { friend auto operator-(ct, ct) { struct X{}; return X(); } void x(); };
 inline auto f() { return ct<float>() - ct<float>(); }
-#pragma clang module end
-#pragma clang module endbuild
+#endif
 
+//--- use.cc
+// expected-no-diagnostics
 // Force the definition of ct in module A to be the primary definition.
-#pragma clang module import A
+#include "a.h"
 template<typename T> void ct<T>::x() {}
 
 // Attempt to cause the definition of operator- in the ct primary template in
 // module B to be the primary definition of that function. If that happens,
 // we'll be left with a class template ct that appears to not contain a
 // definition of the inline friend function.
-#pragma clang module import B
+#include "b.h"
 auto v = f();
 
 ct<int> make();

>From 5059059c7b752904c7e81078395adcdb8cd1d63b Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Wed, 16 Oct 2024 10:39:28 +0200
Subject: [PATCH 26/82] [SystemZ] Add missing newline character in
 verifyNarrowIntegerArgs_Call(). (#112499)

---
 llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index c7626434efac67..83417e570dabf7 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -9863,7 +9863,7 @@ verifyNarrowIntegerArgs_Call(const SmallVectorImpl<ISD::OutputArg> &Outs,
     if (CalleeFn != nullptr)
       printFunctionArgExts(CalleeFn, errs());
     else
-      errs() << "-";
+      errs() << "-\n";
     errs() << "Caller:  ";
     printFunctionArgExts(F, errs());
     llvm_unreachable("");

>From 949177dabc86c99667cb490119e028ce0e7dc628 Mon Sep 17 00:00:00 2001
From: Daniel Grumberg <dgrumberg at apple.com>
Date: Wed, 16 Oct 2024 09:45:43 +0100
Subject: [PATCH 27/82] [clang][ExtractAPI] Fix up casting from CXXClassRecord
 (#110983)

`RecordRecord::classofKind` and `TagRecord::classofKind` didn't
correctly capture `RK_CXXClass` and derived variants, e.g.
`RK_ClassTemplate`. This manifested as anonymous C++ tag types not
being correctly detected when they needed to be merged with another
record.
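For illustration only (not part of this patch), a hedged sketch of the input
that exposes the problem, mirroring the anonymous_record_no_typedef test
updated below:

  // When this header is processed as C++, the anonymous tag below is
  // represented by a CXXClassRecord-derived API record, so
  // RecordRecord::classofKind (and TagRecord::classofKind) must also accept
  // RK_CXXClass and its template variants for the record to be detected and
  // merged with the record for `global`.
  struct { char *prefix; char *content; } global;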
---
 clang/include/clang/ExtractAPI/API.h          | 37 +++++++++++++++-
 .../ExtractAPI/anonymous_record_no_typedef.c  | 44 +++++++++++++------
 .../ExtractAPI/typedef_anonymous_record.c     | 27 +++++++-----
 3 files changed, 82 insertions(+), 26 deletions(-)

diff --git a/clang/include/clang/ExtractAPI/API.h b/clang/include/clang/ExtractAPI/API.h
index 4f34fcc575e807..c30e6fac66d6ba 100644
--- a/clang/include/clang/ExtractAPI/API.h
+++ b/clang/include/clang/ExtractAPI/API.h
@@ -26,6 +26,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Triple.h"
 #include <cstddef>
 #include <iterator>
@@ -615,7 +616,24 @@ struct TagRecord : APIRecord, RecordContext {
     return classofKind(Record->getKind());
   }
   static bool classofKind(RecordKind K) {
-    return K == RK_Struct || K == RK_Union || K == RK_Enum;
+    switch (K) {
+    case RK_Enum:
+      LLVM_FALLTHROUGH;
+    case RK_Struct:
+      LLVM_FALLTHROUGH;
+    case RK_Union:
+      LLVM_FALLTHROUGH;
+    case RK_CXXClass:
+      LLVM_FALLTHROUGH;
+    case RK_ClassTemplate:
+      LLVM_FALLTHROUGH;
+    case RK_ClassTemplateSpecialization:
+      LLVM_FALLTHROUGH;
+    case RK_ClassTemplatePartialSpecialization:
+      return true;
+    default:
+      return false;
+    }
   }
 
   bool IsEmbeddedInVarDeclarator;
@@ -684,7 +702,22 @@ struct RecordRecord : TagRecord {
     return classofKind(Record->getKind());
   }
   static bool classofKind(RecordKind K) {
-    return K == RK_Struct || K == RK_Union;
+    switch (K) {
+    case RK_Struct:
+      LLVM_FALLTHROUGH;
+    case RK_Union:
+      LLVM_FALLTHROUGH;
+    case RK_CXXClass:
+      LLVM_FALLTHROUGH;
+    case RK_ClassTemplate:
+      LLVM_FALLTHROUGH;
+    case RK_ClassTemplateSpecialization:
+      LLVM_FALLTHROUGH;
+    case RK_ClassTemplatePartialSpecialization:
+      return true;
+    default:
+      return false;
+    }
   }
 
   bool isAnonymousWithNoTypedef() { return Name.empty(); }
diff --git a/clang/test/ExtractAPI/anonymous_record_no_typedef.c b/clang/test/ExtractAPI/anonymous_record_no_typedef.c
index 064c223ad56e73..c0c76ef1f06b57 100644
--- a/clang/test/ExtractAPI/anonymous_record_no_typedef.c
+++ b/clang/test/ExtractAPI/anonymous_record_no_typedef.c
@@ -1,11 +1,18 @@
 // RUN: rm -rf %t
 // RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \
 // RUN:   -triple arm64-apple-macosx -isystem %S -fretain-comments-from-system-headers \
-// RUN:   -x c-header %s -o %t/output.symbols.json -verify
+// RUN:   -x c-header %s -o %t/output-c.symbols.json -verify
+//
+// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \
+// RUN:   -triple arm64-apple-macosx -isystem %S -fretain-comments-from-system-headers \
+// RUN:   -x c++-header %s -o %t/output-cxx.symbols.json -verify
 
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBAL
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix PREFIX
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix CONTENT
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix GLOBAL
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix PREFIX
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix CONTENT
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix GLOBAL
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix PREFIX
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix CONTENT
 /// A global variable with an anonymous struct type.
 struct { char *prefix; char *content; } global;
 // GLOBAL-LABEL: "!testLabel": "c:@global"
@@ -30,7 +37,7 @@ struct { char *prefix; char *content; } global;
 // GLOBAL: "text": "A global variable with an anonymous struct type."
 // GLOBAL:     "kind": {
 // GLOBAL-NEXT:  "displayName": "Global Variable",
-// GLOBAL-NEXT:  "identifier": "c.var"
+// GLOBAL-NEXT:  "identifier": "c{{(\+\+)?}}.var"
 // GLOBAL:       "title": "global"
 // GLOBAL:     "pathComponents": [
 // GLOBAL-NEXT:  "global"
@@ -54,9 +61,12 @@ struct { char *prefix; char *content; } global;
 
 /// A Vehicle
 struct Vehicle {
-    // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix TYPE
-    // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix BICYCLE
-    // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix CAR
+    // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix TYPE
+    // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix BICYCLE
+    // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix CAR
+    // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix TYPE
+    // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix BICYCLE
+    // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix CAR
     /// The type of vehicle.
     enum {
         Bicycle,
@@ -96,9 +106,12 @@ struct Vehicle {
     // CAR-NEXT:   "Car"
     // CAR-NEXT: ]
 
-    // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix INFORMATION
-    // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix WHEELS
-    // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix NAME
+    // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix INFORMATION
+    // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix WHEELS
+    // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix NAME
+    // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix INFORMATION
+    // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix WHEELS
+    // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix NAME
     /// The information about the vehicle.
     union {
         int wheels;
@@ -145,8 +158,10 @@ struct Vehicle {
     // NAME-NEXT: ]
 };
 
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBALCASE
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBALOTHERCASE
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix GLOBALCASE
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix GLOBALOTHERCASE
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix GLOBALCASE
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix GLOBALOTHERCASE
 enum {
   GlobalCase,
   GlobalOtherCase
@@ -163,7 +178,8 @@ enum {
 // GLOBALOTHERCASE-NEXT:   "GlobalOtherCase"
 // GLOBALOTHERCASE-NEXT: ]
 
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix VEC
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix VEC
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix VEC
 union Vector {
   struct {
     float X;
diff --git a/clang/test/ExtractAPI/typedef_anonymous_record.c b/clang/test/ExtractAPI/typedef_anonymous_record.c
index 8e298f8d9ce829..c100e30803e4c8 100644
--- a/clang/test/ExtractAPI/typedef_anonymous_record.c
+++ b/clang/test/ExtractAPI/typedef_anonymous_record.c
@@ -1,8 +1,11 @@
 // RUN: rm -rf %t
 // RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \
-// RUN:   --product-name=TypedefChain -triple arm64-apple-macosx -x c-header %s -o %t/typedefchain.symbols.json -verify
+// RUN:   --product-name=TypedefChain -triple arm64-apple-macosx -x c-header %s -o %t/typedefchain-c.symbols.json -verify
+// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \
+// RUN:   --product-name=TypedefChain -triple arm64-apple-macosx -x c++-header %s -o %t/typedefchain-cxx.symbols.json -verify
 
-// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix MYSTRUCT
+// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix MYSTRUCT
+// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix MYSTRUCT
 typedef struct { } MyStruct;
 // MYSTRUCT-LABEL: "!testLabel": "c:@SA at MyStruct"
 // MYSTRUCT:      "accessLevel": "public",
@@ -34,7 +37,7 @@ typedef struct { } MyStruct;
 // MYSTRUCT-NEXT: ]
 // MYSTRUCT:      "kind": {
 // MYSTRUCT-NEXT:   "displayName": "Structure",
-// MYSTRUCT-NEXT:   "identifier": "c.struct"
+// MYSTRUCT-NEXT:   "identifier": "c{{(\+\+)?}}.struct"
 // MYSTRUCT:           "names": {
 // MYSTRUCT-NEXT:        "navigator": [
 // MYSTRUCT-NEXT:          {
@@ -54,7 +57,8 @@ typedef struct { } MyStruct;
 // MYSTRUCT-NEXT:    "MyStruct"
 // MYSTRUCT-NEXT:  ]
 
-// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix MYSTRUCTSTRUCT
+// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix MYSTRUCTSTRUCT
+// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix MYSTRUCTSTRUCT
 typedef MyStruct MyStructStruct;
 // MYSTRUCTSTRUCT-LABEL: "!testLabel": "c:typedef_anonymous_record.c at T@MyStructStruct"
 // MYSTRUCTSTRUCT: "accessLevel": "public",
@@ -87,10 +91,12 @@ typedef MyStruct MyStructStruct;
 // MYSTRUCTSTRUCT-NEXT:],
 // MYSTRUCTSTRUCT:     "kind": {
 // MYSTRUCTSTRUCT-NEXT:  "displayName": "Type Alias",
-// MYSTRUCTSTRUCT-NEXT:  "identifier": "c.typealias"
+// MYSTRUCTSTRUCT-NEXT:  "identifier": "c{{(\+\+)?}}.typealias"
 
-// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix MYENUM
-// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix CASE
+// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix MYENUM
+// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix CASE
+// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix MYENUM
+// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix CASE
 typedef enum { Case } MyEnum;
 // MYENUM: "source": "c:@EA at MyEnum@Case",
 // MYENUM-NEXT: "target": "c:@EA at MyEnum",
@@ -124,7 +130,7 @@ typedef enum { Case } MyEnum;
 // MYENUM-NEXT:],
 // MYENUM:     "kind": {
 // MYENUM-NEXT:  "displayName": "Enumeration",
-// MYENUM-NEXT:  "identifier": "c.enum"
+// MYENUM-NEXT:  "identifier": "c{{(\+\+)?}}.enum"
 // MYENUM:           "names": {
 // MYENUM-NEXT:        "navigator": [
 // MYENUM-NEXT:          {
@@ -147,7 +153,8 @@ typedef enum { Case } MyEnum;
 // CASE-NEXT:   "Case"
 // CASE-NEXT: ]
 
-// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix MYENUMENUM
+// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix MYENUMENUM
+// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix MYENUMENUM
 typedef MyEnum MyEnumEnum;
 // MYENUMENUM-LABEL: "!testLabel": "c:typedef_anonymous_record.c at T@MyEnumEnum"
 // MYENUMENUM:      "declarationFragments": [
@@ -179,7 +186,7 @@ typedef MyEnum MyEnumEnum;
 // MYENUMENUM-NEXT: ],
 // MYENUMENUM:      "kind": {
 // MYENUMENUM-NEXT:   "displayName": "Type Alias",
-// MYENUMENUM-NEXT:   "identifier": "c.typealias"
+// MYENUMENUM-NEXT:   "identifier": "c{{(\+\+)?}}.typealias"
 // MYENUMENUM-NEXT: },
 // MYENUMENUM: "title": "MyEnumEnum"
 

>From 154929169ab460b6b135103208e7fecd3cfa58f0 Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder at redhat.com>
Date: Wed, 16 Oct 2024 10:47:12 +0200
Subject: [PATCH 28/82] [clang] Implement constexpr __builtin_bit_cast for
 complex types (#109981)

Fixes https://github.com/llvm/llvm-project/issues/94620
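For illustration only (not part of this patch), a hedged sketch of what this
enables, mirroring the test added below (_Complex is the GNU complex extension
clang accepts in C++):

  struct TwoFloats { float a, b; };

  constexpr _Complex float cf = {1.0f, 2.0f};
  // Bit-casting a complex value is now a constant expression.
  constexpr TwoFloats tf = __builtin_bit_cast(TwoFloats, cf);
  static_assert(tf.a == 1.0f && tf.b == 2.0f, "");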
---
 clang/lib/AST/ExprConstant.cpp                | 43 +++++++++++++++++++
 .../SemaCXX/constexpr-builtin-bit-cast.cpp    | 16 +++++++
 2 files changed, 59 insertions(+)

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 52a7f5778ce6d2..8544052d5e4924 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -7237,6 +7237,7 @@ class APValueToBufferConverter {
 
     case APValue::ComplexInt:
     case APValue::ComplexFloat:
+      return visitComplex(Val, Ty, Offset);
     case APValue::FixedPoint:
       // FIXME: We should support these.
 
@@ -7323,6 +7324,31 @@ class APValueToBufferConverter {
     return true;
   }
 
+  bool visitComplex(const APValue &Val, QualType Ty, CharUnits Offset) {
+    const ComplexType *ComplexTy = Ty->castAs<ComplexType>();
+    QualType EltTy = ComplexTy->getElementType();
+    CharUnits EltSizeChars = Info.Ctx.getTypeSizeInChars(EltTy);
+    bool IsInt = Val.isComplexInt();
+
+    if (IsInt) {
+      if (!visitInt(Val.getComplexIntReal(), EltTy,
+                    Offset + (0 * EltSizeChars)))
+        return false;
+      if (!visitInt(Val.getComplexIntImag(), EltTy,
+                    Offset + (1 * EltSizeChars)))
+        return false;
+    } else {
+      if (!visitFloat(Val.getComplexFloatReal(), EltTy,
+                      Offset + (0 * EltSizeChars)))
+        return false;
+      if (!visitFloat(Val.getComplexFloatImag(), EltTy,
+                      Offset + (1 * EltSizeChars)))
+        return false;
+    }
+
+    return true;
+  }
+
   bool visitVector(const APValue &Val, QualType Ty, CharUnits Offset) {
     const VectorType *VTy = Ty->castAs<VectorType>();
     QualType EltTy = VTy->getElementType();
@@ -7595,6 +7621,23 @@ class BufferToAPValueConverter {
     return ArrayValue;
   }
 
+  std::optional<APValue> visit(const ComplexType *Ty, CharUnits Offset) {
+    QualType ElementType = Ty->getElementType();
+    CharUnits ElementWidth = Info.Ctx.getTypeSizeInChars(ElementType);
+    bool IsInt = ElementType->isIntegerType();
+
+    std::optional<APValue> Values[2];
+    for (unsigned I = 0; I != 2; ++I) {
+      Values[I] = visitType(Ty->getElementType(), Offset + I * ElementWidth);
+      if (!Values[I])
+        return std::nullopt;
+    }
+
+    if (IsInt)
+      return APValue(Values[0]->getInt(), Values[1]->getInt());
+    return APValue(Values[0]->getFloat(), Values[1]->getFloat());
+  }
+
   std::optional<APValue> visit(const VectorType *VTy, CharUnits Offset) {
     QualType EltTy = VTy->getElementType();
     unsigned NElts = VTy->getNumElements();
diff --git a/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp b/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
index 7520b43a194aba..5ddb77b35ff145 100644
--- a/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
+++ b/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
@@ -511,3 +511,19 @@ constexpr bool9 bad_short_to_bool9 = __builtin_bit_cast(bool9, static_cast<unsig
 constexpr bool17 bad_int_to_bool17 = __builtin_bit_cast(bool17, 0x0001CAFEU);
 
 }
+
+namespace test_complex {
+  constexpr _Complex unsigned test_int_complex = { 0x0C05FEFE, 0xCAFEBABE };
+  static_assert(round_trip<_Complex unsigned>(0xCAFEBABE0C05FEFEULL), "");
+  static_assert(bit_cast<unsigned long long>(test_int_complex) == (LITTLE_END
+                                                                   ? 0xCAFEBABE0C05FEFE
+                                                                   : 0x0C05FEFECAFEBABE), "");
+  static_assert(sizeof(double) == 2 * sizeof(float));
+  struct TwoFloats { float A; float B; };
+  constexpr _Complex float test_float_complex = {1.0f, 2.0f};
+  constexpr TwoFloats TF = __builtin_bit_cast(TwoFloats, test_float_complex);
+  static_assert(TF.A == 1.0f && TF.B == 2.0f);
+
+  constexpr double D = __builtin_bit_cast(double, test_float_complex);
+  constexpr int M = __builtin_bit_cast(int, test_int_complex); // expected-error {{__builtin_bit_cast source size does not equal destination size}}
+}

>From 21223020ed1bb431d9812e11e0c45dcba5d31ff4 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96 at programmer.net>
Date: Wed, 16 Oct 2024 10:54:08 +0200
Subject: [PATCH 29/82] [LLVM][AArch64][NFC] Remove redundant copy parameter in
 method (#110300)

Pass the outlining style by const reference in
AArch64FunctionInfo::setOutliningStyle() instead of by value, avoiding a
redundant copy of the string.

Fixes #94233
---
 llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 72f110cebbdc8f..85b9733e95c529 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -303,7 +303,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   void setLocalStackSize(uint64_t Size) { LocalStackSize = Size; }
   uint64_t getLocalStackSize() const { return LocalStackSize; }
 
-  void setOutliningStyle(std::string Style) { OutliningStyle = Style; }
+  void setOutliningStyle(const std::string &Style) { OutliningStyle = Style; }
   std::optional<std::string> getOutliningStyle() const {
     return OutliningStyle;
   }

>From 4ba1800be6c9294e21e2b87b64600daac12730c1 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96 at programmer.net>
Date: Wed, 16 Oct 2024 10:55:01 +0200
Subject: [PATCH 30/82] [LLVM][NFC] Reduce copying of parameter in lambda
 (#110299)

Avoid redundant std::string copies in the InvalidateGroupIfMemberMayWrap
lambda by passing the first/last marker as a const char * instead of a
std::string.

Fixes #95642
---
 llvm/lib/Analysis/VectorUtils.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index dbffbb8a5f81d9..6b5251e0ad34eb 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1414,7 +1414,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
 
   auto InvalidateGroupIfMemberMayWrap = [&](InterleaveGroup<Instruction> *Group,
                                             int Index,
-                                            std::string FirstOrLast) -> bool {
+                                            const char *FirstOrLast) -> bool {
     Instruction *Member = Group->getMember(Index);
     assert(Member && "Group member does not exist");
     Value *MemberPtr = getLoadStorePointerOperand(Member);
@@ -1455,11 +1455,10 @@ void InterleavedAccessInfo::analyzeInterleaving(
     // So we check only group member 0 (which is always guaranteed to exist),
     // and group member Factor - 1; If the latter doesn't exist we rely on
     // peeling (if it is a non-reversed access -- see Case 3).
-    if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first")))
+    if (InvalidateGroupIfMemberMayWrap(Group, 0, "first"))
       continue;
     if (Group->getMember(Group->getFactor() - 1))
-      InvalidateGroupIfMemberMayWrap(Group, Group->getFactor() - 1,
-                                     std::string("last"));
+      InvalidateGroupIfMemberMayWrap(Group, Group->getFactor() - 1, "last");
     else {
       // Case 3: A non-reversed interleaved load group with gaps: We need
       // to execute at least one scalar epilogue iteration. This will ensure
@@ -1503,11 +1502,11 @@ void InterleavedAccessInfo::analyzeInterleaving(
     // and the last group member. Case 3 (scalar epilog) is not relevant for
     // stores with gaps, which are implemented with masked-store (rather than
     // speculative access, as in loads).
-    if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first")))
+    if (InvalidateGroupIfMemberMayWrap(Group, 0, "first"))
       continue;
     for (int Index = Group->getFactor() - 1; Index > 0; Index--)
       if (Group->getMember(Index)) {
-        InvalidateGroupIfMemberMayWrap(Group, Index, std::string("last"));
+        InvalidateGroupIfMemberMayWrap(Group, Index, "last");
         break;
       }
   }

>From 4c28d21f6af70ffee33660de35b263283dc32139 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 16 Oct 2024 10:00:49 +0100
Subject: [PATCH 31/82] [AArch64] Avoid single-element vector fp converts in
 streaming[-compatible] functions (#112213)

The single-element vector variants of FCVTZS, FCVTZU, UCVTF, and SCVTF
are only supported in streaming[-compatible] functions with `+sme2p2`.

Reference:
- https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-convert-to-signed-integer--rounding-toward-zero--vector--
- https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/UCVTF--vector--integer---Unsigned-integer-convert-to-floating-point--vector--
- https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/SCVTF--vector--integer---Signed-integer-convert-to-floating-point--vector--

Codegen will be improved in follow-up patches.
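For illustration only (not part of this patch), a hedged C++ sketch of the
source pattern affected; the streaming attribute on the enclosing function is
omitted here:

  // A float -> int -> float round trip. In streaming[-compatible] functions
  // the single-element vector forms (fcvtzs d0, d0 / scvtf d0, d0) are not
  // available without +sme2p2, so codegen now goes through a GPR
  // (fcvtzs x8, d0 / scvtf d0, x8), matching the new test added below.
  double roundtrip(double x) {
    long long i = static_cast<long long>(x);
    return static_cast<double>(i);
  }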
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  15 +-
 .../sve-streaming-mode-cvt-fp-int-fp.ll       | 121 +++++
 ...e-streaming-mode-fixed-length-int-to-fp.ll | 426 +++++++++---------
 3 files changed, 333 insertions(+), 229 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 325508b62a9f14..32f2c7c71d1757 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6237,7 +6237,8 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
 // Some float -> int -> float conversion patterns for which we want to keep the
 // int values in FP registers using the corresponding NEON instructions to
 // avoid more costly int <-> fp register transfers.
-let Predicates = [HasNEONandIsStreamingSafe] in {
+// TODO: Allow these in streaming[-compatible] functions with +sme2p2.
+let Predicates = [HasNEON] in {
 def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))),
           (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;
 def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))),
@@ -6247,7 +6248,8 @@ def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))),
 def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))),
           (UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>;
 
-let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
+// TODO: Allow these in streaming[-compatible] functions with +sme2p2.
+let Predicates = [HasNEON, HasFullFP16] in {
 def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))),
           (SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>;
 def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
@@ -6270,9 +6272,10 @@ def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
 
 // fp16: integer extraction from vector must be at least 32-bits to be legal.
 // Actual extraction result is then an in-reg sign-extension of lower 16-bits.
-let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
-def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract 
-                (v8i16 FPR128:$Rn), (i64 0))), i16)))), 
+// TODO: Allow these in streaming[-compatible] functions with +sme2p2.
+let Predicates = [HasNEON, HasFullFP16] in {
+def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract
+                (v8i16 FPR128:$Rn), (i64 0))), i16)))),
           (SCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
 
 // unsigned 32-bit extracted element is truncated to 16-bits using AND
@@ -6367,7 +6370,7 @@ def : Pat <(f64 (uint_to_fp (i32
                           (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
 // 64-bits -> double are handled in target specific dag combine:
 // performIntToFpCombine.
-} // let Predicates = [HasNEONandIsStreamingSafe]
+} // let Predicates = [HasNEON]
 
 //===----------------------------------------------------------------------===//
 // Advanced SIMD three different-sized vector instructions.
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
new file mode 100644
index 00000000000000..9aadf3133ba197
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc < %s | FileCheck %s --check-prefix=NON-STREAMING
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define double @t1(double %x) {
+; CHECK-LABEL: t1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs x8, d0
+; CHECK-NEXT:    scvtf d0, x8
+; CHECK-NEXT:    ret
+;
+; NON-STREAMING-LABEL: t1:
+; NON-STREAMING:       // %bb.0: // %entry
+; NON-STREAMING-NEXT:    fcvtzs d0, d0
+; NON-STREAMING-NEXT:    scvtf d0, d0
+; NON-STREAMING-NEXT:    ret
+entry:
+  %conv = fptosi double %x to i64
+  %conv1 = sitofp i64 %conv to double
+  ret double %conv1
+}
+
+define float @t2(float %x) {
+; CHECK-LABEL: t2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs w8, s0
+; CHECK-NEXT:    scvtf s0, w8
+; CHECK-NEXT:    ret
+;
+; NON-STREAMING-LABEL: t2:
+; NON-STREAMING:       // %bb.0: // %entry
+; NON-STREAMING-NEXT:    fcvtzs s0, s0
+; NON-STREAMING-NEXT:    scvtf s0, s0
+; NON-STREAMING-NEXT:    ret
+entry:
+  %conv = fptosi float %x to i32
+  %conv1 = sitofp i32 %conv to float
+  ret float %conv1
+}
+
+define half @t3(half %x)  {
+; CHECK-LABEL: t3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    fcvtzs w8, s0
+; CHECK-NEXT:    scvtf s0, w8
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ret
+;
+; NON-STREAMING-LABEL: t3:
+; NON-STREAMING:       // %bb.0: // %entry
+; NON-STREAMING-NEXT:    fcvt s0, h0
+; NON-STREAMING-NEXT:    fcvtzs s0, s0
+; NON-STREAMING-NEXT:    scvtf s0, s0
+; NON-STREAMING-NEXT:    fcvt h0, s0
+; NON-STREAMING-NEXT:    ret
+entry:
+  %conv = fptosi half %x to i32
+  %conv1 = sitofp i32 %conv to half
+  ret half %conv1
+}
+
+define double @t4(double %x) {
+; CHECK-LABEL: t4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu x8, d0
+; CHECK-NEXT:    ucvtf d0, x8
+; CHECK-NEXT:    ret
+;
+; NON-STREAMING-LABEL: t4:
+; NON-STREAMING:       // %bb.0: // %entry
+; NON-STREAMING-NEXT:    fcvtzu d0, d0
+; NON-STREAMING-NEXT:    ucvtf d0, d0
+; NON-STREAMING-NEXT:    ret
+entry:
+  %conv = fptoui double %x to i64
+  %conv1 = uitofp i64 %conv to double
+  ret double %conv1
+}
+
+define float @t5(float %x) {
+; CHECK-LABEL: t5:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu w8, s0
+; CHECK-NEXT:    ucvtf s0, w8
+; CHECK-NEXT:    ret
+;
+; NON-STREAMING-LABEL: t5:
+; NON-STREAMING:       // %bb.0: // %entry
+; NON-STREAMING-NEXT:    fcvtzu s0, s0
+; NON-STREAMING-NEXT:    ucvtf s0, s0
+; NON-STREAMING-NEXT:    ret
+entry:
+  %conv = fptoui float %x to i32
+  %conv1 = uitofp i32 %conv to float
+  ret float %conv1
+}
+
+define half @t6(half %x)  {
+; CHECK-LABEL: t6:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    fcvtzu w8, s0
+; CHECK-NEXT:    ucvtf s0, w8
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ret
+;
+; NON-STREAMING-LABEL: t6:
+; NON-STREAMING:       // %bb.0: // %entry
+; NON-STREAMING-NEXT:    fcvt s0, h0
+; NON-STREAMING-NEXT:    fcvtzu s0, s0
+; NON-STREAMING-NEXT:    ucvtf s0, s0
+; NON-STREAMING-NEXT:    fcvt h0, s0
+; NON-STREAMING-NEXT:    ret
+entry:
+  %conv = fptoui half %x to i32
+  %conv1 = uitofp i32 %conv to half
+  ret half %conv1
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index afd3bb7161c155..0c712a15d4de2f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -21,20 +21,20 @@ define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
@@ -58,36 +58,36 @@ define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) {
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
@@ -115,68 +115,68 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) {
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
@@ -207,11 +207,11 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
-; NONEON-NOSVE-NEXT:    ucvtf s1, s0
-; NONEON-NOSVE-NEXT:    ldr h0, [sp]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
-; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
@@ -234,15 +234,15 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) {
 ; NONEON-NOSVE-NEXT:    sub sp, sp, #32
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
-; NONEON-NOSVE-NEXT:    ucvtf s1, s0
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
 ; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
-; NONEON-NOSVE-NEXT:    ucvtf s1, s0
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
 ; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
 ; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #32
@@ -271,25 +271,25 @@ define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
-; NONEON-NOSVE-NEXT:    ucvtf s1, s0
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
 ; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
-; NONEON-NOSVE-NEXT:    ucvtf s1, s0
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
 ; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
-; NONEON-NOSVE-NEXT:    ucvtf s1, s0
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
 ; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
-; NONEON-NOSVE-NEXT:    ucvtf s1, s0
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
 ; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
@@ -328,47 +328,47 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
-; NONEON-NOSVE-NEXT:    ucvtf s1, s0
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #42]
 ; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
-; NONEON-NOSVE-NEXT:    ucvtf s1, s0
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #38]
 ; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
-; NONEON-NOSVE-NEXT:    ucvtf s1, s0
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
 ; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
-; NONEON-NOSVE-NEXT:    ucvtf s1, s0
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
 ; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
-; NONEON-NOSVE-NEXT:    ucvtf s1, s0
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
 ; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #120]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
-; NONEON-NOSVE-NEXT:    ucvtf s1, s0
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
 ; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #112]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
-; NONEON-NOSVE-NEXT:    ucvtf s1, s0
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
 ; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
-; NONEON-NOSVE-NEXT:    ucvtf s1, s0
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ucvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
 ; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #96]
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
 ; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
@@ -399,8 +399,8 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) {
 ; NONEON-NOSVE-NEXT:    sub sp, sp, #16
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
 ; NONEON-NOSVE-NEXT:    str d0, [sp]
 ; NONEON-NOSVE-NEXT:    ldr d0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
@@ -424,11 +424,11 @@ define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) {
 ; NONEON-NOSVE-NEXT:    sub sp, sp, #32
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
@@ -464,15 +464,13 @@ define void @ucvtf_v4i16_v4f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
 ; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
@@ -529,27 +527,23 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
 ; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #92]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #88]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #80]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #144]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #84]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #80]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #72]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #128]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #64]
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #72]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #68]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #64]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
 ; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
@@ -649,49 +643,42 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
 ; NONEON-NOSVE-NEXT:    str d1, [sp, #328]
 ; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #104]
-; NONEON-NOSVE-NEXT:    str d0, [sp, #168]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #160]
 ; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #176]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #160]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #152]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #156]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #152]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #144]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #148]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #144]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #136]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #140]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #136]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #332]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #192]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #332]
+; NONEON-NOSVE-NEXT:    ucvtf d1, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #328]
 ; NONEON-NOSVE-NEXT:    ldp q4, q3, [sp, #192]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #328]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #184]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #304]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #188]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #184]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #176]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #288]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #180]
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #168]
 ; NONEON-NOSVE-NEXT:    ldp q7, q6, [sp, #288]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #176]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #272]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #172]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #168]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #256]
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #224]
 ; NONEON-NOSVE-NEXT:    ldp q2, q5, [sp, #256]
@@ -1041,10 +1028,9 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) {
 ; NONEON-NOSVE-NEXT:    sub sp, sp, #32
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
 ; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #32
@@ -1073,15 +1059,13 @@ define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
@@ -1120,27 +1104,23 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
-; NONEON-NOSVE-NEXT:    ucvtf d1, d0
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ucvtf d1, w9
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
 ; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
 ; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
@@ -2984,8 +2964,8 @@ define half @ucvtf_i16_f16(ptr %0) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i16_f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
@@ -2996,14 +2976,14 @@ define half @ucvtf_i16_f16(ptr %0) {
 define float @ucvtf_i16_f32(ptr %0) {
 ; CHECK-LABEL: ucvtf_i16_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr h0, [x0]
-; CHECK-NEXT:    ucvtf s0, s0
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    ucvtf s0, w8
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i16_f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
 ; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = uitofp i16 %2 to float
@@ -3013,14 +2993,14 @@ define float @ucvtf_i16_f32(ptr %0) {
 define double @ucvtf_i16_f64(ptr %0) {
 ; CHECK-LABEL: ucvtf_i16_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr h0, [x0]
-; CHECK-NEXT:    ucvtf d0, d0
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    ucvtf d0, w8
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i16_f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
 ; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = uitofp i16 %2 to double
@@ -3065,14 +3045,14 @@ define float @ucvtf_i32_f32(ptr %0) {
 define double @ucvtf_i32_f64(ptr %0) {
 ; CHECK-LABEL: ucvtf_i32_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ucvtf d0, d0
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    ucvtf d0, w8
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_i32_f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
 ; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = uitofp i32 %2 to double

>From 7c5d5c08181f399858d918d6910c021af4ec36c0 Mon Sep 17 00:00:00 2001
From: Karl-Johan Karlsson <karl-johan.karlsson at ericsson.com>
Date: Wed, 16 Oct 2024 11:01:33 +0200
Subject: [PATCH 32/82] [Sema] Fix warning in SemaOpenACC.cpp (#112481)

Fix gcc warning:

clang/lib/Sema/SemaOpenACC.cpp:2208:5: warning: this statement may fall
through [-Wimplicit-fallthrough=]
---
 clang/lib/Sema/SemaOpenACC.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index 22aedbc70df8cc..d33b0d0c1c3018 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -2216,7 +2216,7 @@ ExprResult SemaOpenACC::CheckGangExpr(OpenACCGangKind GK, Expr *E) {
     case OpenACCGangKind::Static:
       return CheckGangStaticExpr(*this, E);
     }
-  }
+  } break;
   default:
     llvm_unreachable("Non compute construct in active compute construct?");
   }
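The hunk above silences gcc's -Wimplicit-fallthrough by adding a `break` after the nested switch: even though every enumerator of the inner switch returns, gcc still treats the inner switch as able to complete (an enum may hold an out-of-range value), so control could fall through into the next case of the outer switch. A minimal standalone sketch of the same pattern, using made-up enums rather than anything from SemaOpenACC, would look like this:

#include <cstdio>

enum class Outer { Compute, Other };
enum class Inner { A, B };

static int classify(Outer o, Inner i) {
  switch (o) {
  case Outer::Compute:
    switch (i) {
    case Inner::A:
      return 1;
    case Inner::B:
      return 2;
    }
    break; // without this, gcc -Wimplicit-fallthrough warns that control may
           // fall through into the next case of the outer switch
  case Outer::Other:
    return 0;
  }
  return -1;
}

int main() { std::printf("%d\n", classify(Outer::Compute, Inner::A)); }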

>From 0eed3055511381436ee69d1caf64a4af47f8d65c Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph at gmail.com>
Date: Wed, 16 Oct 2024 11:09:17 +0200
Subject: [PATCH 33/82] Revert "[MLIR][TableGen] Use const pointers for various
 `Init` objects" (#112506)

Reverts llvm/llvm-project#112316

Bots are failing.
---
 mlir/include/mlir/TableGen/AttrOrTypeDef.h    |  2 +-
 mlir/include/mlir/TableGen/Dialect.h          |  2 +-
 mlir/include/mlir/TableGen/Operator.h         | 15 ++++----
 mlir/lib/TableGen/AttrOrTypeDef.cpp           | 12 +++---
 mlir/lib/TableGen/Attribute.cpp               |  2 +-
 mlir/lib/TableGen/Dialect.cpp                 |  2 +-
 mlir/lib/TableGen/Interfaces.cpp              |  6 +--
 mlir/lib/TableGen/Operator.cpp                | 21 +++++-----
 mlir/lib/TableGen/Pattern.cpp                 |  2 +-
 mlir/lib/TableGen/Type.cpp                    |  2 +-
 mlir/lib/Tools/mlir-tblgen/MlirTblgenMain.cpp | 16 ++++----
 mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp | 38 +++++++++----------
 mlir/tools/mlir-tblgen/DialectGen.cpp         |  9 ++---
 mlir/tools/mlir-tblgen/OmpOpGen.cpp           | 19 ++++------
 14 files changed, 68 insertions(+), 80 deletions(-)
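As background (not a statement about the actual bot failures, whose logs are not quoted here), one common way a const-qualification change like #112316 breaks builds is that every caller binding the getter's result to a non-const pointer stops compiling, so the API change and all in-tree and downstream callers must move in lockstep. The sketch below uses hypothetical stand-in types, not the real llvm::Init / llvm::Record classes:

// Hypothetical, self-contained illustration of the breakage pattern.
struct Init {
  int value = 0;
};

struct Record {
  Init storage;
  // Before the change, callers could legally write `Init *d = r.getDef();`.
  // After const-qualifying the return type, that line no longer compiles.
  const Init *getDef() const { return &storage; }
};

int main() {
  Record r;
  // Init *def = r.getDef();    // error: cannot convert 'const Init *' to 'Init *'
  const Init *def = r.getDef(); // callers must be updated together with the API
  return def->value;
}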

diff --git a/mlir/include/mlir/TableGen/AttrOrTypeDef.h b/mlir/include/mlir/TableGen/AttrOrTypeDef.h
index c3d730e42ef70e..36744c85bc7086 100644
--- a/mlir/include/mlir/TableGen/AttrOrTypeDef.h
+++ b/mlir/include/mlir/TableGen/AttrOrTypeDef.h
@@ -105,7 +105,7 @@ class AttrOrTypeParameter {
   std::optional<StringRef> getDefaultValue() const;
 
   /// Return the underlying def of this parameter.
-  const llvm::Init *getDef() const;
+  llvm::Init *getDef() const;
 
   /// The parameter is pointer-comparable.
   bool operator==(const AttrOrTypeParameter &other) const {
diff --git a/mlir/include/mlir/TableGen/Dialect.h b/mlir/include/mlir/TableGen/Dialect.h
index ea8f40555e4451..3530d240c976c6 100644
--- a/mlir/include/mlir/TableGen/Dialect.h
+++ b/mlir/include/mlir/TableGen/Dialect.h
@@ -92,7 +92,7 @@ class Dialect {
   /// dialect.
   bool usePropertiesForAttributes() const;
 
-  const llvm::DagInit *getDiscardableAttributes() const;
+  llvm::DagInit *getDiscardableAttributes() const;
 
   const llvm::Record *getDef() const { return def; }
 
diff --git a/mlir/include/mlir/TableGen/Operator.h b/mlir/include/mlir/TableGen/Operator.h
index 9e570373d9cd32..768291a3a7267b 100644
--- a/mlir/include/mlir/TableGen/Operator.h
+++ b/mlir/include/mlir/TableGen/Operator.h
@@ -119,15 +119,14 @@ class Operator {
 
   /// A utility iterator over a list of variable decorators.
   struct VariableDecoratorIterator
-      : public llvm::mapped_iterator<const llvm::Init *const *,
-                                     VariableDecorator (*)(
-                                         const llvm::Init *)> {
+      : public llvm::mapped_iterator<llvm::Init *const *,
+                                     VariableDecorator (*)(llvm::Init *)> {
     /// Initializes the iterator to the specified iterator.
-    VariableDecoratorIterator(const llvm::Init *const *it)
-        : llvm::mapped_iterator<const llvm::Init *const *,
-                                VariableDecorator (*)(const llvm::Init *)>(
-              it, &unwrap) {}
-    static VariableDecorator unwrap(const llvm::Init *init);
+    VariableDecoratorIterator(llvm::Init *const *it)
+        : llvm::mapped_iterator<llvm::Init *const *,
+                                VariableDecorator (*)(llvm::Init *)>(it,
+                                                                     &unwrap) {}
+    static VariableDecorator unwrap(llvm::Init *init);
   };
   using var_decorator_iterator = VariableDecoratorIterator;
   using var_decorator_range = llvm::iterator_range<VariableDecoratorIterator>;
diff --git a/mlir/lib/TableGen/AttrOrTypeDef.cpp b/mlir/lib/TableGen/AttrOrTypeDef.cpp
index e72ca155bcf765..9b9d9fd2317d99 100644
--- a/mlir/lib/TableGen/AttrOrTypeDef.cpp
+++ b/mlir/lib/TableGen/AttrOrTypeDef.cpp
@@ -40,7 +40,7 @@ AttrOrTypeDef::AttrOrTypeDef(const llvm::Record *def) : def(def) {
   auto *builderList =
       dyn_cast_or_null<llvm::ListInit>(def->getValueInit("builders"));
   if (builderList && !builderList->empty()) {
-    for (const llvm::Init *init : builderList->getValues()) {
+    for (llvm::Init *init : builderList->getValues()) {
       AttrOrTypeBuilder builder(cast<llvm::DefInit>(init)->getDef(),
                                 def->getLoc());
 
@@ -58,8 +58,8 @@ AttrOrTypeDef::AttrOrTypeDef(const llvm::Record *def) : def(def) {
   if (auto *traitList = def->getValueAsListInit("traits")) {
     SmallPtrSet<const llvm::Init *, 32> traitSet;
     traits.reserve(traitSet.size());
-    llvm::unique_function<void(const llvm::ListInit *)> processTraitList =
-        [&](const llvm::ListInit *traitList) {
+    llvm::unique_function<void(llvm::ListInit *)> processTraitList =
+        [&](llvm::ListInit *traitList) {
           for (auto *traitInit : *traitList) {
             if (!traitSet.insert(traitInit).second)
               continue;
@@ -335,9 +335,7 @@ std::optional<StringRef> AttrOrTypeParameter::getDefaultValue() const {
   return result && !result->empty() ? result : std::nullopt;
 }
 
-const llvm::Init *AttrOrTypeParameter::getDef() const {
-  return def->getArg(index);
-}
+llvm::Init *AttrOrTypeParameter::getDef() const { return def->getArg(index); }
 
 std::optional<Constraint> AttrOrTypeParameter::getConstraint() const {
   if (auto *param = dyn_cast<llvm::DefInit>(getDef()))
@@ -351,7 +349,7 @@ std::optional<Constraint> AttrOrTypeParameter::getConstraint() const {
 //===----------------------------------------------------------------------===//
 
 bool AttributeSelfTypeParameter::classof(const AttrOrTypeParameter *param) {
-  const llvm::Init *paramDef = param->getDef();
+  llvm::Init *paramDef = param->getDef();
   if (auto *paramDefInit = dyn_cast<llvm::DefInit>(paramDef))
     return paramDefInit->getDef()->isSubClassOf("AttributeSelfTypeParameter");
   return false;
diff --git a/mlir/lib/TableGen/Attribute.cpp b/mlir/lib/TableGen/Attribute.cpp
index 887553bca66102..de930cb4007032 100644
--- a/mlir/lib/TableGen/Attribute.cpp
+++ b/mlir/lib/TableGen/Attribute.cpp
@@ -126,7 +126,7 @@ StringRef Attribute::getDerivedCodeBody() const {
 Dialect Attribute::getDialect() const {
   const llvm::RecordVal *record = def->getValue("dialect");
   if (record && record->getValue()) {
-    if (const DefInit *init = dyn_cast<DefInit>(record->getValue()))
+    if (DefInit *init = dyn_cast<DefInit>(record->getValue()))
       return Dialect(init->getDef());
   }
   return Dialect(nullptr);
diff --git a/mlir/lib/TableGen/Dialect.cpp b/mlir/lib/TableGen/Dialect.cpp
index ef39818e439b3e..081f6e56f9ded4 100644
--- a/mlir/lib/TableGen/Dialect.cpp
+++ b/mlir/lib/TableGen/Dialect.cpp
@@ -106,7 +106,7 @@ bool Dialect::usePropertiesForAttributes() const {
   return def->getValueAsBit("usePropertiesForAttributes");
 }
 
-const llvm::DagInit *Dialect::getDiscardableAttributes() const {
+llvm::DagInit *Dialect::getDiscardableAttributes() const {
   return def->getValueAsDag("discardableAttrs");
 }
 
diff --git a/mlir/lib/TableGen/Interfaces.cpp b/mlir/lib/TableGen/Interfaces.cpp
index 4a6709a43d0a8f..a209b003b0f3bb 100644
--- a/mlir/lib/TableGen/Interfaces.cpp
+++ b/mlir/lib/TableGen/Interfaces.cpp
@@ -22,7 +22,7 @@ using namespace mlir::tblgen;
 //===----------------------------------------------------------------------===//
 
 InterfaceMethod::InterfaceMethod(const llvm::Record *def) : def(def) {
-  const llvm::DagInit *args = def->getValueAsDag("arguments");
+  llvm::DagInit *args = def->getValueAsDag("arguments");
   for (unsigned i = 0, e = args->getNumArgs(); i != e; ++i) {
     arguments.push_back(
         {llvm::cast<llvm::StringInit>(args->getArg(i))->getValue(),
@@ -78,7 +78,7 @@ Interface::Interface(const llvm::Record *def) : def(def) {
 
   // Initialize the interface methods.
   auto *listInit = dyn_cast<llvm::ListInit>(def->getValueInit("methods"));
-  for (const llvm::Init *init : listInit->getValues())
+  for (llvm::Init *init : listInit->getValues())
     methods.emplace_back(cast<llvm::DefInit>(init)->getDef());
 
   // Initialize the interface base classes.
@@ -98,7 +98,7 @@ Interface::Interface(const llvm::Record *def) : def(def) {
         baseInterfaces.push_back(std::make_unique<Interface>(baseInterface));
         basesAdded.insert(baseInterface.getName());
       };
-  for (const llvm::Init *init : basesInit->getValues())
+  for (llvm::Init *init : basesInit->getValues())
     addBaseInterfaceFn(Interface(cast<llvm::DefInit>(init)->getDef()));
 }
 
diff --git a/mlir/lib/TableGen/Operator.cpp b/mlir/lib/TableGen/Operator.cpp
index 86670e9f87127c..6a33ff5ecd6721 100644
--- a/mlir/lib/TableGen/Operator.cpp
+++ b/mlir/lib/TableGen/Operator.cpp
@@ -161,7 +161,7 @@ std::string Operator::getQualCppClassName() const {
 StringRef Operator::getCppNamespace() const { return cppNamespace; }
 
 int Operator::getNumResults() const {
-  const DagInit *results = def.getValueAsDag("results");
+  DagInit *results = def.getValueAsDag("results");
   return results->getNumArgs();
 }
 
@@ -198,12 +198,12 @@ auto Operator::getResults() const -> const_value_range {
 }
 
 TypeConstraint Operator::getResultTypeConstraint(int index) const {
-  const DagInit *results = def.getValueAsDag("results");
+  DagInit *results = def.getValueAsDag("results");
   return TypeConstraint(cast<DefInit>(results->getArg(index)));
 }
 
 StringRef Operator::getResultName(int index) const {
-  const DagInit *results = def.getValueAsDag("results");
+  DagInit *results = def.getValueAsDag("results");
   return results->getArgNameStr(index);
 }
 
@@ -241,7 +241,7 @@ Operator::arg_range Operator::getArgs() const {
 }
 
 StringRef Operator::getArgName(int index) const {
-  const DagInit *argumentValues = def.getValueAsDag("arguments");
+  DagInit *argumentValues = def.getValueAsDag("arguments");
   return argumentValues->getArgNameStr(index);
 }
 
@@ -557,7 +557,7 @@ void Operator::populateOpStructure() {
   auto *opVarClass = recordKeeper.getClass("OpVariable");
   numNativeAttributes = 0;
 
-  const DagInit *argumentValues = def.getValueAsDag("arguments");
+  DagInit *argumentValues = def.getValueAsDag("arguments");
   unsigned numArgs = argumentValues->getNumArgs();
 
   // Mapping from name of to argument or result index. Arguments are indexed
@@ -721,8 +721,8 @@ void Operator::populateOpStructure() {
                   " to precede it in traits list");
     };
 
-    std::function<void(const llvm::ListInit *)> insert;
-    insert = [&](const llvm::ListInit *traitList) {
+    std::function<void(llvm::ListInit *)> insert;
+    insert = [&](llvm::ListInit *traitList) {
       for (auto *traitInit : *traitList) {
         auto *def = cast<DefInit>(traitInit)->getDef();
         if (def->isSubClassOf("TraitList")) {
@@ -780,7 +780,7 @@ void Operator::populateOpStructure() {
   auto *builderList =
       dyn_cast_or_null<llvm::ListInit>(def.getValueInit("builders"));
   if (builderList && !builderList->empty()) {
-    for (const llvm::Init *init : builderList->getValues())
+    for (llvm::Init *init : builderList->getValues())
       builders.emplace_back(cast<llvm::DefInit>(init)->getDef(), def.getLoc());
   } else if (skipDefaultBuilders()) {
     PrintFatalError(
@@ -818,8 +818,7 @@ bool Operator::hasAssemblyFormat() const {
 }
 
 StringRef Operator::getAssemblyFormat() const {
-  return TypeSwitch<const llvm::Init *, StringRef>(
-             def.getValueInit("assemblyFormat"))
+  return TypeSwitch<llvm::Init *, StringRef>(def.getValueInit("assemblyFormat"))
       .Case<llvm::StringInit>([&](auto *init) { return init->getValue(); });
 }
 
@@ -833,7 +832,7 @@ void Operator::print(llvm::raw_ostream &os) const {
   }
 }
 
-auto Operator::VariableDecoratorIterator::unwrap(const llvm::Init *init)
+auto Operator::VariableDecoratorIterator::unwrap(llvm::Init *init)
     -> VariableDecorator {
   return VariableDecorator(cast<llvm::DefInit>(init)->getDef());
 }
diff --git a/mlir/lib/TableGen/Pattern.cpp b/mlir/lib/TableGen/Pattern.cpp
index bee20354387fd6..6437839ef20849 100644
--- a/mlir/lib/TableGen/Pattern.cpp
+++ b/mlir/lib/TableGen/Pattern.cpp
@@ -700,7 +700,7 @@ int Pattern::getBenefit() const {
   // The initial benefit value is a heuristic with number of ops in the source
   // pattern.
   int initBenefit = getSourcePattern().getNumOps();
-  const llvm::DagInit *delta = def.getValueAsDag("benefitDelta");
+  llvm::DagInit *delta = def.getValueAsDag("benefitDelta");
   if (delta->getNumArgs() != 1 || !isa<llvm::IntInit>(delta->getArg(0))) {
     PrintFatalError(&def,
                     "The 'addBenefit' takes and only takes one integer value");
diff --git a/mlir/lib/TableGen/Type.cpp b/mlir/lib/TableGen/Type.cpp
index c3b813ec598d0a..cda752297988bb 100644
--- a/mlir/lib/TableGen/Type.cpp
+++ b/mlir/lib/TableGen/Type.cpp
@@ -50,7 +50,7 @@ std::optional<StringRef> TypeConstraint::getBuilderCall() const {
   const llvm::RecordVal *builderCall = baseType->getValue("builderCall");
   if (!builderCall || !builderCall->getValue())
     return std::nullopt;
-  return TypeSwitch<const llvm::Init *, std::optional<StringRef>>(
+  return TypeSwitch<llvm::Init *, std::optional<StringRef>>(
              builderCall->getValue())
       .Case<llvm::StringInit>([&](auto *init) {
         StringRef value = init->getValue();
diff --git a/mlir/lib/Tools/mlir-tblgen/MlirTblgenMain.cpp b/mlir/lib/Tools/mlir-tblgen/MlirTblgenMain.cpp
index 20ad4292a548bf..7119324dd125d5 100644
--- a/mlir/lib/Tools/mlir-tblgen/MlirTblgenMain.cpp
+++ b/mlir/lib/Tools/mlir-tblgen/MlirTblgenMain.cpp
@@ -30,8 +30,8 @@ enum DeprecatedAction { None, Warn, Error };
 static DeprecatedAction actionOnDeprecatedValue;
 
 // Returns if there is a use of `deprecatedInit` in `field`.
-static bool findUse(const Init *field, const Init *deprecatedInit,
-                    llvm::DenseMap<const Init *, bool> &known) {
+static bool findUse(Init *field, Init *deprecatedInit,
+                    llvm::DenseMap<Init *, bool> &known) {
   if (field == deprecatedInit)
     return true;
 
@@ -64,13 +64,13 @@ static bool findUse(const Init *field, const Init *deprecatedInit,
     if (findUse(dagInit->getOperator(), deprecatedInit, known))
       return memoize(true);
 
-    return memoize(llvm::any_of(dagInit->getArgs(), [&](const Init *arg) {
+    return memoize(llvm::any_of(dagInit->getArgs(), [&](Init *arg) {
       return findUse(arg, deprecatedInit, known);
     }));
   }
 
-  if (const ListInit *li = dyn_cast<ListInit>(field)) {
-    return memoize(llvm::any_of(li->getValues(), [&](const Init *jt) {
+  if (ListInit *li = dyn_cast<ListInit>(field)) {
+    return memoize(llvm::any_of(li->getValues(), [&](Init *jt) {
       return findUse(jt, deprecatedInit, known);
     }));
   }
@@ -83,8 +83,8 @@ static bool findUse(const Init *field, const Init *deprecatedInit,
 }
 
 // Returns if there is a use of `deprecatedInit` in `record`.
-static bool findUse(Record &record, const Init *deprecatedInit,
-                    llvm::DenseMap<const Init *, bool> &known) {
+static bool findUse(Record &record, Init *deprecatedInit,
+                    llvm::DenseMap<Init *, bool> &known) {
   return llvm::any_of(record.getValues(), [&](const RecordVal &val) {
     return findUse(val.getValue(), deprecatedInit, known);
   });
@@ -100,7 +100,7 @@ static void warnOfDeprecatedUses(const RecordKeeper &records) {
     if (!r || !r->getValue())
       continue;
 
-    llvm::DenseMap<const Init *, bool> hasUse;
+    llvm::DenseMap<Init *, bool> hasUse;
     if (auto *si = dyn_cast<StringInit>(r->getValue())) {
       for (auto &jt : records.getDefs()) {
         // Skip anonymous defs.
diff --git a/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp b/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp
index 6a3d5a25e28cd9..86ebaf2cf27dfe 100644
--- a/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp
+++ b/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp
@@ -46,9 +46,8 @@ class Generator {
 private:
   /// Emits parse calls to construct given kind.
   void emitParseHelper(StringRef kind, StringRef returnType, StringRef builder,
-                       ArrayRef<const Init *> args,
-                       ArrayRef<std::string> argNames, StringRef failure,
-                       mlir::raw_indented_ostream &ios);
+                       ArrayRef<Init *> args, ArrayRef<std::string> argNames,
+                       StringRef failure, mlir::raw_indented_ostream &ios);
 
   /// Emits print instructions.
   void emitPrintHelper(const Record *memberRec, StringRef kind,
@@ -136,12 +135,10 @@ void Generator::emitParse(StringRef kind, const Record &x) {
       R"(static {0} read{1}(MLIRContext* context, DialectBytecodeReader &reader) )";
   mlir::raw_indented_ostream os(output);
   std::string returnType = getCType(&x);
-  os << formatv(head,
-                kind == "attribute" ? "::mlir::Attribute" : "::mlir::Type",
-                x.getName());
-  const DagInit *members = x.getValueAsDag("members");
-  SmallVector<std::string> argNames = llvm::to_vector(
-      map_range(members->getArgNames(), [](const StringInit *init) {
+  os << formatv(head, kind == "attribute" ? "::mlir::Attribute" : "::mlir::Type", x.getName());
+  DagInit *members = x.getValueAsDag("members");
+  SmallVector<std::string> argNames =
+      llvm::to_vector(map_range(members->getArgNames(), [](StringInit *init) {
         return init->getAsUnquotedString();
       }));
   StringRef builder = x.getValueAsString("cBuilder").trim();
@@ -151,7 +148,7 @@ void Generator::emitParse(StringRef kind, const Record &x) {
 }
 
 void printParseConditional(mlir::raw_indented_ostream &ios,
-                           ArrayRef<const Init *> args,
+                           ArrayRef<Init *> args,
                            ArrayRef<std::string> argNames) {
   ios << "if ";
   auto parenScope = ios.scope("(", ") {");
@@ -162,7 +159,7 @@ void printParseConditional(mlir::raw_indented_ostream &ios,
   };
 
   auto parsedArgs =
-      llvm::to_vector(make_filter_range(args, [](const Init *const attr) {
+      llvm::to_vector(make_filter_range(args, [](Init *const attr) {
         const Record *def = cast<DefInit>(attr)->getDef();
         if (def->isSubClassOf("Array"))
           return true;
@@ -171,7 +168,7 @@ void printParseConditional(mlir::raw_indented_ostream &ios,
 
   interleave(
       zip(parsedArgs, argNames),
-      [&](std::tuple<const Init *&, const std::string &> it) {
+      [&](std::tuple<llvm::Init *&, const std::string &> it) {
         const Record *attr = cast<DefInit>(std::get<0>(it))->getDef();
         std::string parser;
         if (auto optParser = attr->getValueAsOptionalString("cParser")) {
@@ -199,7 +196,7 @@ void printParseConditional(mlir::raw_indented_ostream &ios,
 }
 
 void Generator::emitParseHelper(StringRef kind, StringRef returnType,
-                                StringRef builder, ArrayRef<const Init *> args,
+                                StringRef builder, ArrayRef<Init *> args,
                                 ArrayRef<std::string> argNames,
                                 StringRef failure,
                                 mlir::raw_indented_ostream &ios) {
@@ -213,7 +210,7 @@ void Generator::emitParseHelper(StringRef kind, StringRef returnType,
   // Print decls.
   std::string lastCType = "";
   for (auto [arg, name] : zip(args, argNames)) {
-    const DefInit *first = dyn_cast<DefInit>(arg);
+    DefInit *first = dyn_cast<DefInit>(arg);
     if (!first)
       PrintFatalError("Unexpected type for " + name);
     const Record *def = first->getDef();
@@ -254,14 +251,13 @@ void Generator::emitParseHelper(StringRef kind, StringRef returnType,
     std::string returnType = getCType(def);
     ios << "auto " << listHelperName(name) << " = [&]() -> FailureOr<"
         << returnType << "> ";
-    SmallVector<const Init *> args;
+    SmallVector<Init *> args;
     SmallVector<std::string> argNames;
     if (def->isSubClassOf("CompositeBytecode")) {
-      const DagInit *members = def->getValueAsDag("members");
-      args = llvm::to_vector(map_range(
-          members->getArgs(), [](Init *init) { return (const Init *)init; }));
+      DagInit *members = def->getValueAsDag("members");
+      args = llvm::to_vector(members->getArgs());
       argNames = llvm::to_vector(
-          map_range(members->getArgNames(), [](const StringInit *init) {
+          map_range(members->getArgNames(), [](StringInit *init) {
             return init->getAsUnquotedString();
           }));
     } else {
@@ -336,7 +332,7 @@ void Generator::emitPrint(StringRef kind, StringRef type,
     auto *members = rec->getValueAsDag("members");
     for (auto [arg, name] :
          llvm::zip(members->getArgs(), members->getArgNames())) {
-      const DefInit *def = dyn_cast<DefInit>(arg);
+      DefInit *def = dyn_cast<DefInit>(arg);
       assert(def);
       const Record *memberRec = def->getDef();
       emitPrintHelper(memberRec, kind, kind, name->getAsUnquotedString(), os);
@@ -389,7 +385,7 @@ void Generator::emitPrintHelper(const Record *memberRec, StringRef kind,
     auto *members = memberRec->getValueAsDag("members");
     for (auto [arg, argName] :
          zip(members->getArgs(), members->getArgNames())) {
-      const DefInit *def = dyn_cast<DefInit>(arg);
+      DefInit *def = dyn_cast<DefInit>(arg);
       assert(def);
       emitPrintHelper(def->getDef(), kind, parent,
                       argName->getAsUnquotedString(), ios);
diff --git a/mlir/tools/mlir-tblgen/DialectGen.cpp b/mlir/tools/mlir-tblgen/DialectGen.cpp
index 414cad5e1dcc2e..55c3d9da259005 100644
--- a/mlir/tools/mlir-tblgen/DialectGen.cpp
+++ b/mlir/tools/mlir-tblgen/DialectGen.cpp
@@ -46,10 +46,10 @@ using DialectFilterIterator =
 } // namespace
 
 static void populateDiscardableAttributes(
-    Dialect &dialect, const llvm::DagInit *discardableAttrDag,
+    Dialect &dialect, llvm::DagInit *discardableAttrDag,
     SmallVector<std::pair<std::string, std::string>> &discardableAttributes) {
   for (int i : llvm::seq<int>(0, discardableAttrDag->getNumArgs())) {
-    const llvm::Init *arg = discardableAttrDag->getArg(i);
+    llvm::Init *arg = discardableAttrDag->getArg(i);
 
     StringRef givenName = discardableAttrDag->getArgNameStr(i);
     if (givenName.empty())
@@ -271,8 +271,7 @@ static void emitDialectDecl(Dialect &dialect, raw_ostream &os) {
     if (dialect.hasOperationInterfaceFallback())
       os << operationInterfaceFallbackDecl;
 
-    const llvm::DagInit *discardableAttrDag =
-        dialect.getDiscardableAttributes();
+    llvm::DagInit *discardableAttrDag = dialect.getDiscardableAttributes();
     SmallVector<std::pair<std::string, std::string>> discardableAttributes;
     populateDiscardableAttributes(dialect, discardableAttrDag,
                                   discardableAttributes);
@@ -371,7 +370,7 @@ static void emitDialectDef(Dialect &dialect, const RecordKeeper &records,
   StringRef superClassName =
       dialect.isExtensible() ? "ExtensibleDialect" : "Dialect";
 
-  const llvm::DagInit *discardableAttrDag = dialect.getDiscardableAttributes();
+  llvm::DagInit *discardableAttrDag = dialect.getDiscardableAttributes();
   SmallVector<std::pair<std::string, std::string>> discardableAttributes;
   populateDiscardableAttributes(dialect, discardableAttrDag,
                                 discardableAttributes);
diff --git a/mlir/tools/mlir-tblgen/OmpOpGen.cpp b/mlir/tools/mlir-tblgen/OmpOpGen.cpp
index 8716667723a373..1c20a6a9bcf4e8 100644
--- a/mlir/tools/mlir-tblgen/OmpOpGen.cpp
+++ b/mlir/tools/mlir-tblgen/OmpOpGen.cpp
@@ -102,13 +102,11 @@ static StringRef extractOmpClauseName(const Record *clause) {
 
 /// Check that the given argument, identified by its name and initialization
 /// value, is present in the \c arguments `dag`.
-static bool verifyArgument(const DagInit *arguments, StringRef argName,
-                           const Init *argInit) {
+static bool verifyArgument(DagInit *arguments, StringRef argName,
+                           Init *argInit) {
   auto range = zip_equal(arguments->getArgNames(), arguments->getArgs());
   return llvm::any_of(
-      range,
-      [&](std::tuple<const llvm::StringInit *const &, const llvm::Init *const &>
-              v) {
+      range, [&](std::tuple<llvm::StringInit *const &, llvm::Init *const &> v) {
         return std::get<0>(v)->getAsUnquotedString() == argName &&
                std::get<1>(v) == argInit;
       });
@@ -143,8 +141,8 @@ static void verifyClause(const Record *op, const Record *clause) {
   StringRef clauseClassName = extractOmpClauseName(clause);
 
   if (!clause->getValueAsBit("ignoreArgs")) {
-    const DagInit *opArguments = op->getValueAsDag("arguments");
-    const DagInit *arguments = clause->getValueAsDag("arguments");
+    DagInit *opArguments = op->getValueAsDag("arguments");
+    DagInit *arguments = clause->getValueAsDag("arguments");
 
     for (auto [name, arg] :
          zip(arguments->getArgNames(), arguments->getArgs())) {
@@ -210,9 +208,8 @@ static void verifyClause(const Record *op, const Record *clause) {
 ///
 /// \return the name of the base type to represent elements of the argument
 ///         type.
-static StringRef translateArgumentType(ArrayRef<SMLoc> loc,
-                                       const StringInit *name, const Init *init,
-                                       int &nest, int &rank) {
+static StringRef translateArgumentType(ArrayRef<SMLoc> loc, StringInit *name,
+                                       Init *init, int &nest, int &rank) {
   const Record *def = cast<DefInit>(init)->getDef();
 
   llvm::StringSet<> superClasses;
@@ -285,7 +282,7 @@ static void genClauseOpsStruct(const Record *clause, raw_ostream &os) {
   StringRef clauseName = extractOmpClauseName(clause);
   os << "struct " << clauseName << "ClauseOps {\n";
 
-  const DagInit *arguments = clause->getValueAsDag("arguments");
+  DagInit *arguments = clause->getValueAsDag("arguments");
   for (auto [name, arg] :
        zip_equal(arguments->getArgNames(), arguments->getArgs())) {
     int nest = 0, rank = 1;

>From 1d40fefb08e9b11b72bf40274aa7839ae9f7fe07 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 15 Oct 2024 23:36:10 +0100
Subject: [PATCH 34/82] [RISCV] Add zvfhmin/zvfbfmin cost model tests for
 libcall ops. NFC

---
 .../Analysis/CostModel/RISCV/fp-sqrt-pow.ll   | 207 +++--
 .../CostModel/RISCV/fp-trig-log-exp.ll        | 805 ++++++++++++------
 2 files changed, 655 insertions(+), 357 deletions(-)

diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll
index 78acba832dbbf2..1768222b8a92e5 100644
--- a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll
@@ -1,17 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
 
 define void @sqrt() {
 ; CHECK-LABEL: 'sqrt'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sqrt.f16(half undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 16 x half> @llvm.sqrt.nxv16f16(<vscale x 16 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x bfloat> @llvm.sqrt.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x bfloat> @llvm.sqrt.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 8 x bfloat> @llvm.sqrt.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 16 x bfloat> @llvm.sqrt.nxv16bf16(<vscale x 16 x bfloat> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call float @llvm.sqrt.f32(float undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
@@ -33,15 +34,15 @@ define void @sqrt() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = call <vscale x 8 x double> @llvm.sqrt.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  call half @llvm.sqrt.f16(half undef)
-  call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
-  call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
-  call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
-  call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
-  call <vscale x 2 x half> @llvm.sqrt.nvx2f16(<vscale x 2 x half> undef)
-  call <vscale x 4 x half> @llvm.sqrt.nvx4f16(<vscale x 4 x half> undef)
-  call <vscale x 8 x half> @llvm.sqrt.nvx8f16(<vscale x 8 x half> undef)
-  call <vscale x 16 x half> @llvm.sqrt.nvx16f16(<vscale x 16 x half> undef)
+  call bfloat @llvm.sqrt.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.sqrt.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.sqrt.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.sqrt.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.sqrt.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.sqrt.f32(float undef)
   call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef)
   call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
@@ -64,58 +65,74 @@ define void @sqrt() {
   ret void
 }
 
-declare half @llvm.sqrt.f16(half)
-declare <2 x half> @llvm.sqrt.v2f16(<2 x half>)
-declare <4 x half> @llvm.sqrt.v4f16(<4 x half>)
-declare <8 x half> @llvm.sqrt.v8f16(<8 x half>)
-declare <16 x half> @llvm.sqrt.v16f16(<16 x half>)
-declare <vscale x 2 x half> @llvm.sqrt.nvx2f16(<vscale x 2 x half>)
-declare <vscale x 4 x half> @llvm.sqrt.nvx4f16(<vscale x 4 x half>)
-declare <vscale x 8 x half> @llvm.sqrt.nvx8f16(<vscale x 8 x half>)
-declare <vscale x 16 x half> @llvm.sqrt.nvx16f16(<vscale x 16 x half>)
-declare float @llvm.sqrt.f32(float)
-declare <2 x float> @llvm.sqrt.v2f32(<2 x float>)
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
-declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
-declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.sqrt.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.sqrt.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.sqrt.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.sqrt.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.sqrt.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.sqrt.f64(double)
-declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
-declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
-declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
-declare <16 x double> @llvm.sqrt.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.sqrt.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.sqrt.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.sqrt.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.sqrt.nvx8f64(<vscale x 8 x double>)
+define void @sqrt_f16() {
+; CHECK-LABEL: 'sqrt_f16'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sqrt.f16(half undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 16 x half> @llvm.sqrt.nxv16f16(<vscale x 16 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.sqrt.f16(half undef)
+  call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+  call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+  call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+  call <vscale x 2 x half> @llvm.sqrt.nvx2f16(<vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.sqrt.nvx4f16(<vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.sqrt.nvx8f16(<vscale x 8 x half> undef)
+  call <vscale x 16 x half> @llvm.sqrt.nvx16f16(<vscale x 16 x half> undef)
+  ret void
+}
 
 define void @pow() {
 ; CHECK-LABEL: 'pow'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.pow.f32(float undef, float undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.pow.v2f32(<2 x float> undef, <2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.pow.v4f32(<4 x float> undef, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.pow.v8f32(<8 x float> undef, <8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.pow.v16f32(<16 x float> undef, <16 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.pow.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.pow.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.pow.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.pow.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.pow.f64(double undef, double undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.pow.v2f64(<2 x double> undef, <2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.pow.v4f64(<4 x double> undef, <4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.pow.v8f64(<8 x double> undef, <8 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.pow.v16f64(<16 x double> undef, <16 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.pow.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.pow.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.pow.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.pow.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.pow.bf16(bfloat undef, bfloat undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.pow.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.pow.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.pow.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.pow.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.pow.nxv1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.pow.nxv2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.pow.nxv4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.pow.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.pow.nxv16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.pow.f32(float undef, float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.pow.v2f32(<2 x float> undef, <2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.pow.v4f32(<4 x float> undef, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.pow.v8f32(<8 x float> undef, <8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.pow.v16f32(<16 x float> undef, <16 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.pow.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.pow.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.pow.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.pow.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.pow.f64(double undef, double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.pow.v2f64(<2 x double> undef, <2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.pow.v4f64(<4 x double> undef, <4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.pow.v8f64(<8 x double> undef, <8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.pow.v16f64(<16 x double> undef, <16 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.pow.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.pow.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.pow.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.pow.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  call bfloat @llvm.pow.bf16(bfloat undef, bfloat undef)
+  call <2 x bfloat> @llvm.pow.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+  call <4 x bfloat> @llvm.pow.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+  call <8 x bfloat> @llvm.pow.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+  call <16 x bfloat> @llvm.pow.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+  call <vscale x 1 x bfloat> @llvm.pow.nvx1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.pow.nvx2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.pow.nvx4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.pow.nvx8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.pow.nvx16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef)
   call float @llvm.pow.f32(float undef, float undef)
   call <2 x float> @llvm.pow.v2f32(<2 x float> undef, <2 x float> undef)
   call <4 x float> @llvm.pow.v4f32(<4 x float> undef, <4 x float> undef)
@@ -138,22 +155,42 @@ define void @pow() {
   ret void
 }
 
-declare float @llvm.pow.f32(float, float)
-declare <2 x float> @llvm.pow.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.pow.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.pow.v8f32(<8 x float>, <8 x float>)
-declare <16 x float> @llvm.pow.v16f32(<16 x float>, <16 x float>)
-declare <vscale x 1 x float> @llvm.pow.nvx1f32(<vscale x 1 x float>, <vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.pow.nvx2f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.pow.nvx4f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.pow.nvx8f32(<vscale x 8 x float>, <vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.pow.nvx16f32(<vscale x 16 x float>, <vscale x 16 x float>)
-declare double @llvm.pow.f64(double, double)
-declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.pow.v4f64(<4 x double>, <4 x double>)
-declare <8 x double> @llvm.pow.v8f64(<8 x double>, <8 x double>)
-declare <16 x double> @llvm.pow.v16f64(<16 x double>, <16 x double>)
-declare <vscale x 1 x double> @llvm.pow.nvx1f64(<vscale x 1 x double>, <vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.pow.nvx2f64(<vscale x 2 x double>, <vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.pow.nvx4f64(<vscale x 4 x double>, <vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.pow.nvx8f64(<vscale x 8 x double>, <vscale x 8 x double>)
+define void @pow_f16() {
+; ZVFH-LABEL: 'pow_f16'
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.pow.f16(half undef, half undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.pow.v2f16(<2 x half> undef, <2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.pow.v4f16(<4 x half> undef, <4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.pow.v8f16(<8 x half> undef, <8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.pow.v16f16(<16 x half> undef, <16 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.pow.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.pow.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.pow.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.pow.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x half> @llvm.pow.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'pow_f16'
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.pow.f16(half undef, half undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.pow.v2f16(<2 x half> undef, <2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.pow.v4f16(<4 x half> undef, <4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.pow.v8f16(<8 x half> undef, <8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.pow.v16f16(<16 x half> undef, <16 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.pow.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.pow.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.pow.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.pow.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x half> @llvm.pow.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.pow.f16(half undef, half undef)
+  call <2 x half> @llvm.pow.v2f16(<2 x half> undef, <2 x half> undef)
+  call <4 x half> @llvm.pow.v4f16(<4 x half> undef, <4 x half> undef)
+  call <8 x half> @llvm.pow.v8f16(<8 x half> undef, <8 x half> undef)
+  call <16 x half> @llvm.pow.v16f16(<16 x half> undef, <16 x half> undef)
+  call <vscale x 1 x half> @llvm.pow.nvx1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef)
+  call <vscale x 2 x half> @llvm.pow.nvx2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.pow.nvx4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.pow.nvx8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef)
+  call <vscale x 16 x half> @llvm.pow.nvx16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll b/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll
index af779116a62780..d65fa43b89952e 100644
--- a/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll
@@ -1,29 +1,50 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
 
 define void @sin() {
 ; CHECK-LABEL: 'sin'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.sin.f32(float undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.sin.v2f32(<2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.sin.v4f32(<4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.sin.v8f32(<8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.sin.v16f32(<16 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.sin.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.sin.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.sin.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.sin.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.sin.f64(double undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.sin.v2f64(<2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.sin.v4f64(<4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.sin.v8f64(<8 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.sin.v16f64(<16 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.sin.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.sin.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.sin.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.sin.bf16(bfloat undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.sin.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.sin.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.sin.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.sin.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.sin.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.sin.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.sin.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.sin.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.sin.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.sin.f32(float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.sin.v2f32(<2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.sin.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.sin.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.sin.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.sin.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.sin.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.sin.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.sin.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.sin.f64(double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.sin.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.sin.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.sin.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.sin.v16f64(<16 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.sin.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.sin.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.sin.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  call bfloat @llvm.sin.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.sin.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.sin.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.sin.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.sin.v16bf16(<16 x bfloat> undef)
+  call <vscale x 1 x bfloat> @llvm.sin.nvx1bf16(<vscale x 1 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.sin.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.sin.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.sin.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.sin.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.sin.f32(float undef)
   call <2 x float> @llvm.sin.v2f32(<2 x float> undef)
   call <4 x float> @llvm.sin.v4f32(<4 x float> undef)
@@ -46,29 +67,86 @@ define void @sin() {
   ret void
 }
 
+define void @sin_f16() {
+; ZVFH-LABEL: 'sin_f16'
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sin.f16(half undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.sin.v2f16(<2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.sin.v4f16(<4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.sin.v8f16(<8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.sin.v16f16(<16 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.sin.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.sin.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.sin.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.sin.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'sin_f16'
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.sin.f16(half undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.sin.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.sin.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.sin.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.sin.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.sin.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.sin.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.sin.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.sin.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.sin.f16(half undef)
+  call <2 x half> @llvm.sin.v2f16(<2 x half> undef)
+  call <4 x half> @llvm.sin.v4f16(<4 x half> undef)
+  call <8 x half> @llvm.sin.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.sin.v16f16(<16 x half> undef)
+  call <vscale x 1 x half> @llvm.sin.nvx1f16(<vscale x 1 x half> undef)
+  call <vscale x 2 x half> @llvm.sin.nvx2f16(<vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.sin.nvx4f16(<vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.sin.nvx8f16(<vscale x 8 x half> undef)
+  ret void
+}
+
 define void @cos() {
 ; CHECK-LABEL: 'cos'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.cos.f32(float undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.cos.v2f32(<2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.cos.v4f32(<4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.cos.v8f32(<8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.cos.v16f32(<16 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.cos.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.cos.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.cos.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.cos.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.cos.f64(double undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.cos.v2f64(<2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.cos.v4f64(<4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.cos.v8f64(<8 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.cos.v16f64(<16 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.cos.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.cos.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.cos.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.cos.bf16(bfloat undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.cos.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.cos.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.cos.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.cos.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.cos.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.cos.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.cos.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.cos.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.cos.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.cos.f32(float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.cos.v2f32(<2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.cos.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.cos.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.cos.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.cos.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.cos.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.cos.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.cos.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.cos.f64(double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.cos.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.cos.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.cos.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.cos.v16f64(<16 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.cos.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.cos.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.cos.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  call bfloat @llvm.cos.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.cos.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.cos.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.cos.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.cos.v16bf16(<16 x bfloat> undef)
+  call <vscale x 1 x bfloat> @llvm.cos.nvx1bf16(<vscale x 1 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.cos.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.cos.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.cos.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.cos.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.cos.f32(float undef)
   call <2 x float> @llvm.cos.v2f32(<2 x float> undef)
   call <4 x float> @llvm.cos.v4f32(<4 x float> undef)
@@ -91,29 +169,86 @@ define void @cos() {
   ret void
 }
 
+define void @cos_f16() {
+; ZVFH-LABEL: 'cos_f16'
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.cos.f16(half undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.cos.v2f16(<2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.cos.v4f16(<4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.cos.v8f16(<8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.cos.v16f16(<16 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.cos.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.cos.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.cos.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.cos.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'cos_f16'
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.cos.f16(half undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.cos.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.cos.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.cos.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.cos.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.cos.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.cos.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.cos.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.cos.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.cos.f16(half undef)
+  call <2 x half> @llvm.cos.v2f16(<2 x half> undef)
+  call <4 x half> @llvm.cos.v4f16(<4 x half> undef)
+  call <8 x half> @llvm.cos.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.cos.v16f16(<16 x half> undef)
+  call <vscale x 1 x half> @llvm.cos.nvx1f16(<vscale x 1 x half> undef)
+  call <vscale x 2 x half> @llvm.cos.nvx2f16(<vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.cos.nvx4f16(<vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.cos.nvx8f16(<vscale x 8 x half> undef)
+  ret void
+}
+
 define void @exp() {
 ; CHECK-LABEL: 'exp'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.exp.f32(float undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.exp.v4f32(<4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.exp.v8f32(<8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.exp.v16f32(<16 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.exp.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.exp.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.exp.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.exp.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.exp.f64(double undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.exp.v2f64(<2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.exp.v4f64(<4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.exp.v8f64(<8 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.exp.v16f64(<16 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.exp.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.exp.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.exp.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.exp.bf16(bfloat undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.exp.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.exp.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.exp.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.exp.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.exp.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.exp.f32(float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.exp.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.exp.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.exp.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.exp.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.exp.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.exp.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.exp.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.exp.f64(double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.exp.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.exp.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.exp.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.exp.v16f64(<16 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.exp.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.exp.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.exp.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  call bfloat @llvm.exp.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+  call <vscale x 1 x bfloat> @llvm.exp.nvx1bf16(<vscale x 1 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.exp.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.exp.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.exp.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.exp.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.exp.f32(float undef)
   call <2 x float> @llvm.exp.v2f32(<2 x float> undef)
   call <4 x float> @llvm.exp.v4f32(<4 x float> undef)
@@ -136,29 +271,86 @@ define void @exp() {
   ret void
 }
 
+define void @exp_f16() {
+; ZVFH-LABEL: 'exp_f16'
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.exp.f16(half undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.exp.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.exp.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.exp.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.exp.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'exp_f16'
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.exp.f16(half undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.exp.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.exp.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.exp.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.exp.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.exp.f16(half undef)
+  call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+  call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+  call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+  call <vscale x 1 x half> @llvm.exp.nvx1f16(<vscale x 1 x half> undef)
+  call <vscale x 2 x half> @llvm.exp.nvx2f16(<vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.exp.nvx4f16(<vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.exp.nvx8f16(<vscale x 8 x half> undef)
+  ret void
+}
+
 define void @exp2() {
 ; CHECK-LABEL: 'exp2'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.exp2.f32(float undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.exp2.v8f32(<8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.exp2.v16f32(<16 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.exp2.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.exp2.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.exp2.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.exp2.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.exp2.f64(double undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.exp2.v2f64(<2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.exp2.v4f64(<4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.exp2.v8f64(<8 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.exp2.v16f64(<16 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.exp2.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.exp2.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.exp2.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.exp2.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.exp2.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.exp2.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.exp2.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.exp2.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.exp2.f32(float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.exp2.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.exp2.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.exp2.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.exp2.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.exp2.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.exp2.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.exp2.f64(double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.exp2.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.exp2.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.exp2.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.exp2.v16f64(<16 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.exp2.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.exp2.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.exp2.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  call bfloat @llvm.exp2.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+  call <vscale x 1 x bfloat> @llvm.exp2.nvx1bf16(<vscale x 1 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.exp2.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.exp2.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.exp2.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.exp2.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.exp2.f32(float undef)
   call <2 x float> @llvm.exp2.v2f32(<2 x float> undef)
   call <4 x float> @llvm.exp2.v4f32(<4 x float> undef)
@@ -181,29 +373,86 @@ define void @exp2() {
   ret void
 }
 
+define void @exp2_f16() {
+; ZVFH-LABEL: 'exp2_f16'
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.exp2.f16(half undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.exp2.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.exp2.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.exp2.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.exp2.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'exp2_f16'
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.exp2.f16(half undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.exp2.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.exp2.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.exp2.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.exp2.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.exp2.f16(half undef)
+  call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+  call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+  call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+  call <vscale x 1 x half> @llvm.exp2.nvx1f16(<vscale x 1 x half> undef)
+  call <vscale x 2 x half> @llvm.exp2.nvx2f16(<vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.exp2.nvx4f16(<vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.exp2.nvx8f16(<vscale x 8 x half> undef)
+  ret void
+}
+
 define void @log() {
 ; CHECK-LABEL: 'log'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.log.f32(float undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.log.v2f32(<2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.log.v4f32(<4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.log.v8f32(<8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.log.v16f32(<16 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.log.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.log.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.log.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.log.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.log.f64(double undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.log.v2f64(<2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.log.v4f64(<4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.log.v8f64(<8 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.log.v16f64(<16 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.log.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.log.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.log.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.log.bf16(bfloat undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.log.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.log.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.log.f32(float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.log.v2f32(<2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.log.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.log.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.log.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.log.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.log.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.log.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.log.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.log.f64(double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.log.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.log.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.log.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.log.v16f64(<16 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.log.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.log.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.log.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  call bfloat @llvm.log.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+  call <vscale x 1 x bfloat> @llvm.log.nvx1bf16(<vscale x 1 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.log.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.log.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.log.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.log.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.log.f32(float undef)
   call <2 x float> @llvm.log.v2f32(<2 x float> undef)
   call <4 x float> @llvm.log.v4f32(<4 x float> undef)
@@ -226,29 +475,86 @@ define void @log() {
   ret void
 }
 
+define void @log_f16() {
+; ZVFH-LABEL: 'log_f16'
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.log.f16(half undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'log_f16'
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.log.f16(half undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.log.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.log.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.log.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.log.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.log.f16(half undef)
+  call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+  call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+  call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+  call <vscale x 1 x half> @llvm.log.nvx1f16(<vscale x 1 x half> undef)
+  call <vscale x 2 x half> @llvm.log.nvx2f16(<vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.log.nvx4f16(<vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.log.nvx8f16(<vscale x 8 x half> undef)
+  ret void
+}
+
 define void @log10() {
 ; CHECK-LABEL: 'log10'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.log10.f32(float undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.log10.v8f32(<8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.log10.v16f32(<16 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.log10.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.log10.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.log10.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.log10.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.log10.f64(double undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.log10.v2f64(<2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.log10.v4f64(<4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.log10.v8f64(<8 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.log10.v16f64(<16 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.log10.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.log10.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.log10.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.log10.bf16(bfloat undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log10.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log10.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.log10.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log10.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.log10.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.log10.f32(float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.log10.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.log10.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.log10.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.log10.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.log10.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.log10.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.log10.f64(double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.log10.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.log10.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.log10.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.log10.v16f64(<16 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.log10.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.log10.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.log10.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  call bfloat @llvm.log10.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+  call <vscale x 1 x bfloat> @llvm.log10.nvx1bf16(<vscale x 1 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.log10.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.log10.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.log10.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.log10.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.log10.f32(float undef)
   call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
   call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
@@ -271,29 +577,86 @@ define void @log10() {
   ret void
 }
 
+define void @log10_f16() {
+; ZVFH-LABEL: 'log10_f16'
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.log10.f16(half undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log10.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log10.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log10.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log10.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'log10_f16'
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.log10.f16(half undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.log10.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.log10.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.log10.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.log10.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.log10.f16(half undef)
+  call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+  call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+  call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+  call <vscale x 1 x half> @llvm.log10.nvx1f16(<vscale x 1 x half> undef)
+  call <vscale x 2 x half> @llvm.log10.nvx2f16(<vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.log10.nvx4f16(<vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.log10.nvx8f16(<vscale x 8 x half> undef)
+  ret void
+}
+
 define void @log2() {
 ; CHECK-LABEL: 'log2'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.log2.f32(float undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.log2.v8f32(<8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.log2.v16f32(<16 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.log2.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.log2.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.log2.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.log2.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.log2.f64(double undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.log2.v2f64(<2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.log2.v4f64(<4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.log2.v8f64(<8 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.log2.v16f64(<16 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.log2.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.log2.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.log2.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.log2.bf16(bfloat undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log2.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log2.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.log2.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log2.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.log2.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.log2.f32(float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.log2.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.log2.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.log2.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.log2.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.log2.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.log2.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.log2.f64(double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.log2.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.log2.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.log2.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.log2.v16f64(<16 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.log2.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.log2.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.log2.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  call bfloat @llvm.log2.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+  call <vscale x 1 x bfloat> @llvm.log2.nvx1bf16(<vscale x 1 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.log2.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.log2.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.log2.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.log2.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.log2.f32(float undef)
   call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
   call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
@@ -316,142 +679,40 @@ define void @log2() {
   ret void
 }
 
-declare float @llvm.sin.f32(float)
-declare <2 x float> @llvm.sin.v2f32(<2 x float>)
-declare <4 x float> @llvm.sin.v4f32(<4 x float>)
-declare <8 x float> @llvm.sin.v8f32(<8 x float>)
-declare <16 x float> @llvm.sin.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.sin.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.sin.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.sin.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.sin.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.sin.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.sin.f64(double)
-declare <2 x double> @llvm.sin.v2f64(<2 x double>)
-declare <4 x double> @llvm.sin.v4f64(<4 x double>)
-declare <8 x double> @llvm.sin.v8f64(<8 x double>)
-declare <16 x double> @llvm.sin.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.sin.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.sin.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.sin.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.sin.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.cos.f32(float)
-declare <2 x float> @llvm.cos.v2f32(<2 x float>)
-declare <4 x float> @llvm.cos.v4f32(<4 x float>)
-declare <8 x float> @llvm.cos.v8f32(<8 x float>)
-declare <16 x float> @llvm.cos.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.cos.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.cos.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.cos.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.cos.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.cos.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.cos.f64(double)
-declare <2 x double> @llvm.cos.v2f64(<2 x double>)
-declare <4 x double> @llvm.cos.v4f64(<4 x double>)
-declare <8 x double> @llvm.cos.v8f64(<8 x double>)
-declare <16 x double> @llvm.cos.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.cos.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.cos.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.cos.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.cos.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.exp.f32(float)
-declare <2 x float> @llvm.exp.v2f32(<2 x float>)
-declare <4 x float> @llvm.exp.v4f32(<4 x float>)
-declare <8 x float> @llvm.exp.v8f32(<8 x float>)
-declare <16 x float> @llvm.exp.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.exp.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.exp.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.exp.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.exp.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.exp.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.exp.f64(double)
-declare <2 x double> @llvm.exp.v2f64(<2 x double>)
-declare <4 x double> @llvm.exp.v4f64(<4 x double>)
-declare <8 x double> @llvm.exp.v8f64(<8 x double>)
-declare <16 x double> @llvm.exp.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.exp.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.exp.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.exp.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.exp.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.exp2.f32(float)
-declare <2 x float> @llvm.exp2.v2f32(<2 x float>)
-declare <4 x float> @llvm.exp2.v4f32(<4 x float>)
-declare <8 x float> @llvm.exp2.v8f32(<8 x float>)
-declare <16 x float> @llvm.exp2.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.exp2.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.exp2.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.exp2.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.exp2.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.exp2.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.exp2.f64(double)
-declare <2 x double> @llvm.exp2.v2f64(<2 x double>)
-declare <4 x double> @llvm.exp2.v4f64(<4 x double>)
-declare <8 x double> @llvm.exp2.v8f64(<8 x double>)
-declare <16 x double> @llvm.exp2.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.exp2.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.exp2.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.exp2.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.exp2.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.log.f32(float)
-declare <2 x float> @llvm.log.v2f32(<2 x float>)
-declare <4 x float> @llvm.log.v4f32(<4 x float>)
-declare <8 x float> @llvm.log.v8f32(<8 x float>)
-declare <16 x float> @llvm.log.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.log.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.log.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.log.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.log.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.log.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.log.f64(double)
-declare <2 x double> @llvm.log.v2f64(<2 x double>)
-declare <4 x double> @llvm.log.v4f64(<4 x double>)
-declare <8 x double> @llvm.log.v8f64(<8 x double>)
-declare <16 x double> @llvm.log.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.log.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.log.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.log.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.log.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.log10.f32(float)
-declare <2 x float> @llvm.log10.v2f32(<2 x float>)
-declare <4 x float> @llvm.log10.v4f32(<4 x float>)
-declare <8 x float> @llvm.log10.v8f32(<8 x float>)
-declare <16 x float> @llvm.log10.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.log10.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.log10.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.log10.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.log10.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.log10.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.log10.f64(double)
-declare <2 x double> @llvm.log10.v2f64(<2 x double>)
-declare <4 x double> @llvm.log10.v4f64(<4 x double>)
-declare <8 x double> @llvm.log10.v8f64(<8 x double>)
-declare <16 x double> @llvm.log10.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.log10.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.log10.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.log10.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.log10.nvx8f64(<vscale x 8 x double>)
+define void @log2_f16() {
+; ZVFH-LABEL: 'log2_f16'
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.log2.f16(half undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log2.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log2.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log2.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log2.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'log2_f16'
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.log2.f16(half undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.log2.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.log2.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.log2.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.log2.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.log2.f16(half undef)
+  call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+  call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+  call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+  call <vscale x 1 x half> @llvm.log2.nvx1f16(<vscale x 1 x half> undef)
+  call <vscale x 2 x half> @llvm.log2.nvx2f16(<vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.log2.nvx4f16(<vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.log2.nvx8f16(<vscale x 8 x half> undef)
+  ret void
+}
 
-declare float @llvm.log2.f32(float)
-declare <2 x float> @llvm.log2.v2f32(<2 x float>)
-declare <4 x float> @llvm.log2.v4f32(<4 x float>)
-declare <8 x float> @llvm.log2.v8f32(<8 x float>)
-declare <16 x float> @llvm.log2.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.log2.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.log2.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.log2.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.log2.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.log2.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.log2.f64(double)
-declare <2 x double> @llvm.log2.v2f64(<2 x double>)
-declare <4 x double> @llvm.log2.v4f64(<4 x double>)
-declare <8 x double> @llvm.log2.v8f64(<8 x double>)
-declare <16 x double> @llvm.log2.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.log2.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.log2.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.log2.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.log2.nvx8f64(<vscale x 8 x double>)

>From b5cc222d7429fe6f18c787f633d5262fac2e676f Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Wed, 16 Oct 2024 14:52:25 +0530
Subject: [PATCH 35/82] [MIR] Fix vreg flag vector memory leak (#112479)

A fix-it patch for dbfca24 (#110228).

There is no need for a container here; a plain `uint8_t` bitmask still
allows 8 flags per register.

The virtual register flags vector leaked memory: `VRegInfo` objects are
allocated with a `BumpPtrAllocator`, which frees the raw storage but never
runs destructors, so the heap buffer owned by `std::vector<uint8_t> Flags`
was never released.
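
For illustration only (not part of the patch), a minimal sketch of the
failure mode, assuming LLVM's Support headers are available: a
`BumpPtrAllocator` releases its slabs without running destructors, so heap
storage owned by a `std::vector` member leaks, while a trivially
destructible bitmask has no destructor to miss.

  #include "llvm/Support/Allocator.h"
  #include <cstdint>
  #include <new>
  #include <vector>

  struct LeakyInfo {
    std::vector<uint8_t> Flags; // owns heap memory; needs ~LeakyInfo()
  };

  struct FlatInfo {
    uint8_t Flags = 0; // trivially destructible, still room for 8 flag bits
  };

  int main() {
    llvm::BumpPtrAllocator Alloc;

    // Placement-new into bump-allocated storage. Destroying Alloc frees the
    // slabs but never calls ~LeakyInfo(), so the vector's heap buffer leaks.
    auto *Leaky = new (Alloc.Allocate<LeakyInfo>()) LeakyInfo();
    Leaky->Flags.push_back(1);

    // The flat variant is safe to bump-allocate: there is nothing to destroy.
    auto *Flat = new (Alloc.Allocate<FlatInfo>()) FlatInfo();
    Flat->Flags |= 1 << 2; // set one flag bit, as `Info.Flags |= FlagValue` does
    return 0;
  }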
---
 llvm/include/llvm/CodeGen/MIRParser/MIParser.h | 2 +-
 llvm/lib/CodeGen/MIRParser/MIRParser.cpp       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
index 4d93213de5e070..0f2898d3554d06 100644
--- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
+++ b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
@@ -45,7 +45,7 @@ struct VRegInfo {
   } D;
   Register VReg;
   Register PreferredReg;
-  std::vector<uint8_t> Flags;
+  uint8_t Flags = 0;
 };
 
 using Name2RegClassMap = StringMap<const TargetRegisterClass *>;
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index 10d3cdcf0c1ce1..c0c61b3fdd1677 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -703,7 +703,7 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS,
         return error(FlagStringValue.SourceRange.Start,
                      Twine("use of undefined register flag '") +
                          FlagStringValue.Value + "'");
-      Info.Flags.push_back(FlagValue);
+      Info.Flags |= FlagValue;
     }
     RegInfo.noteNewVirtualRegister(Info.VReg);
   }

>From 15d85769f119061fbfcae6e9de43982b534ef724 Mon Sep 17 00:00:00 2001
From: Sergio Afonso <safonsof at amd.com>
Date: Wed, 16 Oct 2024 10:27:50 +0100
Subject: [PATCH 36/82] [Flang][OpenMP] Support lowering of simd reductions
 (#112194)

This patch enables lowering of the `reduction` clause of `simd`
constructs to MLIR. Lowering from MLIR to LLVM IR remains unimplemented,
so at that stage an error is now emitted instead of the clause being
silently ignored, as was previously the case.

On composite `do simd` constructs this lowering error is not triggered,
because the `omp.simd` operation is currently ignored there. The MLIR
representation, however, now carries the `reduction` information.
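
As a reading aid, a condensed restatement of the guard added in the
OpenMPToLLVMIRTranslation.cpp hunks below (no functionality beyond what the
diff shows): the MLIR-to-LLVM-IR translation now rejects still-unsupported
`omp.simd` clauses with a specific diagnostic instead of dropping them.

  // Condensed from the hunks in this patch.
  static LogicalResult simdOpSupported(omp::SimdOp op) {
    if (!op.getLinearVars().empty() || !op.getLinearStepVars().empty())
      return op.emitError("linear clause not yet supported");
    if (!op.getPrivateVars().empty() || op.getPrivateSyms())
      return op.emitError("privatization clauses not yet supported");
    if (!op.getReductionVars().empty() || op.getReductionByref() ||
        op.getReductionSyms())
      return op.emitError("reduction clause not yet supported");
    return success();
  }

  // convertOmpSimd() then simply propagates the failure:
  //   if (failed(simdOpSupported(simdOp)))
  //     return failure();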
---
 flang/lib/Lower/OpenMP/OpenMP.cpp             | 16 +++++++++----
 flang/test/Lower/OpenMP/simd.f90              | 24 +++++++++++++++++++
 mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp  |  8 ++++---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 21 ++++++++++++----
 4 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 70d89f5e521a59..cf469003b7298d 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -2070,7 +2070,9 @@ static void genStandaloneSimd(lower::AbstractConverter &converter,
                      loopNestClauseOps, iv);
 
   EntryBlockArgs simdArgs;
-  // TODO: Add private, reduction syms and vars.
+  // TODO: Add private syms and vars.
+  simdArgs.reduction.syms = simdReductionSyms;
+  simdArgs.reduction.vars = simdClauseOps.reductionVars;
   auto simdOp =
       genWrapperOp<mlir::omp::SimdOp>(converter, loc, simdClauseOps, simdArgs);
 
@@ -2228,7 +2230,9 @@ static void genCompositeDistributeParallelDoSimd(
   wsloopOp.setComposite(/*val=*/true);
 
   EntryBlockArgs simdArgs;
-  // TODO: Add private, reduction syms and vars.
+  // TODO: Add private syms and vars.
+  simdArgs.reduction.syms = simdReductionSyms;
+  simdArgs.reduction.vars = simdClauseOps.reductionVars;
   auto simdOp =
       genWrapperOp<mlir::omp::SimdOp>(converter, loc, simdClauseOps, simdArgs);
   simdOp.setComposite(/*val=*/true);
@@ -2285,7 +2289,9 @@ static void genCompositeDistributeSimd(lower::AbstractConverter &converter,
   distributeOp.setComposite(/*val=*/true);
 
   EntryBlockArgs simdArgs;
-  // TODO: Add private, reduction syms and vars.
+  // TODO: Add private syms and vars.
+  simdArgs.reduction.syms = simdReductionSyms;
+  simdArgs.reduction.vars = simdClauseOps.reductionVars;
   auto simdOp =
       genWrapperOp<mlir::omp::SimdOp>(converter, loc, simdClauseOps, simdArgs);
   simdOp.setComposite(/*val=*/true);
@@ -2342,7 +2348,9 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter,
   wsloopOp.setComposite(/*val=*/true);
 
   EntryBlockArgs simdArgs;
-  // TODO: Add private, reduction syms and vars.
+  // TODO: Add private syms and vars.
+  simdArgs.reduction.syms = simdReductionSyms;
+  simdArgs.reduction.vars = simdClauseOps.reductionVars;
   auto simdOp =
       genWrapperOp<mlir::omp::SimdOp>(converter, loc, simdClauseOps, simdArgs);
   simdOp.setComposite(/*val=*/true);
diff --git a/flang/test/Lower/OpenMP/simd.f90 b/flang/test/Lower/OpenMP/simd.f90
index f574a1265e06c4..d92f06cebfdbe5 100644
--- a/flang/test/Lower/OpenMP/simd.f90
+++ b/flang/test/Lower/OpenMP/simd.f90
@@ -4,6 +4,8 @@
 ! RUN: %flang_fc1 -flang-experimental-hlfir -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s
 ! RUN: bbc -hlfir -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s
 
+!CHECK: omp.declare_reduction @[[REDUCER:.*]] : i32
+
 !CHECK-LABEL: func @_QPsimd()
 subroutine simd
   integer :: i
@@ -273,3 +275,25 @@ subroutine lastprivate_with_simd
     sum = i + 1
   end do
 end subroutine
+
+!CHECK-LABEL: func @_QPsimd_with_reduction_clause()
+subroutine simd_with_reduction_clause
+  integer :: i, x
+  x = 0
+  ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
+  ! CHECK-NEXT: %[[UB:.*]] = arith.constant 9 : i32
+  ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
+  ! CHECK-NEXT: omp.simd reduction(@[[REDUCER]] %[[X:.*]]#0 -> %[[X_RED:.*]] : !fir.ref<i32>) {
+  ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
+  !$omp simd reduction(+:x)
+  do i=1, 9
+    ! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_RED]] {uniq_name = "_QFsimd_with_reduction_clauseEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
+    ! CHECK: %[[X_LD:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+    ! CHECK: %[[I_LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref<i32>
+    ! CHECK: %[[SUM:.*]] = arith.addi %[[X_LD]], %[[I_LD]] : i32
+    ! CHECK: hlfir.assign %[[SUM]] to %[[X_DECL]]#0 : i32, !fir.ref<i32>
+    x = x+i
+  end do
+  !$OMP end simd
+end subroutine
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index c6c6edb8f999fc..3217542e105607 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -2012,14 +2012,16 @@ void SimdOp::build(OpBuilder &builder, OperationState &state,
                    const SimdOperands &clauses) {
   MLIRContext *ctx = builder.getContext();
   // TODO Store clauses in op: linearVars, linearStepVars, privateVars,
-  // privateSyms, reductionVars, reductionByref, reductionSyms.
+  // privateSyms.
   SimdOp::build(builder, state, clauses.alignedVars,
                 makeArrayAttr(ctx, clauses.alignments), clauses.ifExpr,
                 /*linear_vars=*/{}, /*linear_step_vars=*/{},
                 clauses.nontemporalVars, clauses.order, clauses.orderMod,
                 /*private_vars=*/{}, /*private_syms=*/nullptr,
-                /*reduction_vars=*/{}, /*reduction_byref=*/nullptr,
-                /*reduction_syms=*/nullptr, clauses.safelen, clauses.simdlen);
+                clauses.reductionVars,
+                makeDenseBoolArrayAttr(ctx, clauses.reductionByref),
+                makeArrayAttr(ctx, clauses.reductionSyms), clauses.safelen,
+                clauses.simdlen);
 }
 
 LogicalResult SimdOp::verify() {
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 4a575f4e577062..cb7dd3cd874d9f 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1785,6 +1785,20 @@ convertOrderKind(std::optional<omp::ClauseOrderKind> o) {
   llvm_unreachable("Unknown ClauseOrderKind kind");
 }
 
+static LogicalResult simdOpSupported(omp::SimdOp op) {
+  if (!op.getLinearVars().empty() || !op.getLinearStepVars().empty())
+    return op.emitError("linear clause not yet supported");
+
+  if (!op.getPrivateVars().empty() || op.getPrivateSyms())
+    return op.emitError("privatization clauses not yet supported");
+
+  if (!op.getReductionVars().empty() || op.getReductionByref() ||
+      op.getReductionSyms())
+    return op.emitError("reduction clause not yet supported");
+
+  return success();
+}
+
 /// Converts an OpenMP simd loop into LLVM IR using OpenMPIRBuilder.
 static LogicalResult
 convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
@@ -1792,11 +1806,8 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
   auto simdOp = cast<omp::SimdOp>(opInst);
   auto loopOp = cast<omp::LoopNestOp>(simdOp.getWrappedLoop());
 
-  if (!simdOp.getLinearVars().empty() || !simdOp.getLinearStepVars().empty() ||
-      !simdOp.getPrivateVars().empty() || simdOp.getPrivateSyms() ||
-      !simdOp.getReductionVars().empty() || simdOp.getReductionByref() ||
-      simdOp.getReductionSyms())
-    return opInst.emitError("unhandled clauses for translation to LLVM IR");
+  if (failed(simdOpSupported(simdOp)))
+    return failure();
 
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
 

>From 70334081f75d67900c6ffa193c60c4d6f4767354 Mon Sep 17 00:00:00 2001
From: Simon Camphausen <simon.camphausen at iml.fraunhofer.de>
Date: Wed, 16 Oct 2024 11:49:49 +0200
Subject: [PATCH 37/82] [mlir][bufferization] Expose buffer alignment as a pass
 option in one-shot-bufferize (#112505)
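
For illustration, a minimal sketch of driving the same knob programmatically (not part of this patch; the options-struct name and surrounding setup are assumed, while the `bufferAlignment` member and the `buffer-alignment` option key come from the change below):

  // Assumed: the MLIR bufferization headers are included and the options
  // struct is mlir::bufferization::OneShotBufferizationOptions, mirroring
  // what the pass does with the new option.
  mlir::bufferization::OneShotBufferizationOptions opt;
  opt.bufferAlignment = 128;              // pass-option default remains 64
  opt.bufferizeFunctionBoundaries = true;
  // Command-line equivalent:
  //   mlir-opt -one-shot-bufferize="buffer-alignment=128" input.mlir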

---
 mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td | 2 ++
 mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp      | 1 +
 2 files changed, 3 insertions(+)

diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
index a683a905cd2d6b..cc5463ea968fc3 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
@@ -536,6 +536,8 @@ def OneShotBufferize : Pass<"one-shot-bufferize", "ModuleOp"> {
     Option<"unknownTypeConversion", "unknown-type-conversion", "std::string",
            /*default=*/"\"fully-dynamic-layout-map\"",
            "Controls layout maps for non-inferrable memref types.">,
+    Option<"bufferAlignment", "buffer-alignment", "uint64_t", /*default=*/"64",
+           "Sets the alignment of newly allocated buffers.">,
   ];
   let constructor = "mlir::bufferization::createOneShotBufferizePass()";
 
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
index 875d8c40e92cc1..1d009b03754c52 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
@@ -224,6 +224,7 @@ struct OneShotBufferizePass
         };
       }
       opt.printConflicts = printConflicts;
+      opt.bufferAlignment = bufferAlignment;
       opt.testAnalysisOnly = testAnalysisOnly;
       opt.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries;
       opt.checkParallelRegions = checkParallelRegions;

>From 467a9bde06e681cecc69afa18580aadf2ed9769b Mon Sep 17 00:00:00 2001
From: Youngsuk Kim <youngsuk.kim at hpe.com>
Date: Wed, 16 Oct 2024 06:14:38 -0400
Subject: [PATCH 38/82] [polly] Avoid llvm::Type::getPointerTo() (NFC)
 (#112368)

`llvm::Type::getPointerTo()` is due to be deprecated and removed soon.

Also, avoid pointer casts that are essentially no-ops.
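
For reference, the replacement pattern in isolation (a sketch only; `Builder` and `Ctx` are assumed to be an existing IRBuilder and LLVMContext):

  // Old: llvm::Type *PtrTy = Ty->getPointerTo(AS);
  // With opaque pointers only the address space matters:
  llvm::Type *PtrTy  = Builder.getPtrTy(/*AddrSpace=*/0);
  llvm::Type *PtrTy2 = llvm::PointerType::get(Ctx, /*AddrSpace=*/0);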
---
 polly/lib/CodeGen/BlockGenerators.cpp    | 6 ------
 polly/lib/CodeGen/IslNodeBuilder.cpp     | 2 --
 polly/lib/CodeGen/LoopGeneratorsGOMP.cpp | 2 +-
 3 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/polly/lib/CodeGen/BlockGenerators.cpp b/polly/lib/CodeGen/BlockGenerators.cpp
index c58763603cfa22..b76d8f4c18a551 100644
--- a/polly/lib/CodeGen/BlockGenerators.cpp
+++ b/polly/lib/CodeGen/BlockGenerators.cpp
@@ -786,12 +786,6 @@ void BlockGenerator::generateScalarStores(
                                Builder.GetInsertBlock())) &&
                  "Domination violation");
 
-          // The new Val might have a different type than the old Val due to
-          // ScalarEvolution looking through bitcasts.
-          Address = Builder.CreateBitOrPointerCast(
-              Address, Val->getType()->getPointerTo(
-                           Address->getType()->getPointerAddressSpace()));
-
           Builder.CreateStore(Val, Address);
         });
   }
diff --git a/polly/lib/CodeGen/IslNodeBuilder.cpp b/polly/lib/CodeGen/IslNodeBuilder.cpp
index 3f07f02038a1ec..d76f6251ea4ced 100644
--- a/polly/lib/CodeGen/IslNodeBuilder.cpp
+++ b/polly/lib/CodeGen/IslNodeBuilder.cpp
@@ -1050,8 +1050,6 @@ Value *IslNodeBuilder::preloadUnconditionally(__isl_take isl_set *AccessRange,
 
   auto *Ptr = AddressValue;
   auto Name = Ptr->getName();
-  auto AS = Ptr->getType()->getPointerAddressSpace();
-  Ptr = Builder.CreatePointerCast(Ptr, Ty->getPointerTo(AS), Name + ".cast");
   PreloadVal = Builder.CreateLoad(Ty, Ptr, Name + ".load");
   if (LoadInst *PreloadInst = dyn_cast<LoadInst>(PreloadVal))
     PreloadInst->setAlignment(cast<LoadInst>(AccInst)->getAlign());
diff --git a/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp b/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp
index cd440b28202e6d..b98416a92097f2 100644
--- a/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp
+++ b/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp
@@ -183,7 +183,7 @@ Value *ParallelLoopGeneratorGOMP::createCallGetWorkItem(Value *LBPtr,
   // If F is not available, declare it.
   if (!F) {
     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
-    Type *Params[] = {LongType->getPointerTo(), LongType->getPointerTo()};
+    Type *Params[] = {Builder.getPtrTy(0), Builder.getPtrTy(0)};
     FunctionType *Ty = FunctionType::get(Builder.getInt8Ty(), Params, false);
     F = Function::Create(Ty, Linkage, Name, M);
   }

>From 25b702f2637d5520cacf59e6a92e52956ccc7e8d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 16 Oct 2024 11:13:16 +0100
Subject: [PATCH 39/82] [DAG] visitXOR - add missing comment for or/and
 constant setcc demorgan fold. NFC.

Noticed while triaging #112347, which uses this fold: we described the or->and fold but not the equivalent and->or fold, which is also handled.
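
Both folds are just De Morgan's laws on i1; a standalone sketch (illustration only, not part of the patch) that checks them exhaustively:

  #include <cassert>
  int main() {
    for (int xi = 0; xi < 2; ++xi)
      for (int yi = 0; yi < 2; ++yi) {
        bool x = xi, y = yi;
        assert(!(x || y) == (!x && !y)); // the documented or->and fold
        assert(!(x && y) == (!x || !y)); // the newly documented and->or fold
      }
    return 0;
  }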
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 608ee8558e9928..0879165aac139d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9594,6 +9594,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
   }
 
   // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
+  // fold (not (and x, y)) -> (or (not x), (not y)) iff x or y are setcc
   if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() &&
       (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
     SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);

>From 9ee9e0e3b2323e7cca00a5223ace5e25e7ed1c1f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 16 Oct 2024 11:15:29 +0100
Subject: [PATCH 40/82] [X86] Extend ANDNOT fold tests to cover all legal
 scalars and 256-bit vectors

Add tests to check what happens on i8/i16/i32 scalars (ANDN only has i32/i64 variants)
---
 llvm/test/CodeGen/X86/pr108731.ll | 127 ++++++++++++++++++++++++++++--
 1 file changed, 121 insertions(+), 6 deletions(-)

diff --git a/llvm/test/CodeGen/X86/pr108731.ll b/llvm/test/CodeGen/X86/pr108731.ll
index 87dce0314d08e2..473b4f7f4da2e3 100644
--- a/llvm/test/CodeGen/X86/pr108731.ll
+++ b/llvm/test/CodeGen/X86/pr108731.ll
@@ -2,8 +2,8 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,NOBMI
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,BMI
 
-define i64 @foo(i64 %w, i64 %x, i64 %y, i64 %z) {
-; NOBMI-LABEL: foo:
+define i64 @test_i64(i64 %w, i64 %x, i64 %y, i64 %z) {
+; NOBMI-LABEL: test_i64:
 ; NOBMI:       # %bb.0: # %Entry
 ; NOBMI-NEXT:    movq %rcx, %rax
 ; NOBMI-NEXT:    andq %rdx, %rsi
@@ -14,7 +14,7 @@ define i64 @foo(i64 %w, i64 %x, i64 %y, i64 %z) {
 ; NOBMI-NEXT:    andq %rsi, %rax
 ; NOBMI-NEXT:    retq
 ;
-; BMI-LABEL: foo:
+; BMI-LABEL: test_i64:
 ; BMI:       # %bb.0: # %Entry
 ; BMI-NEXT:    andq %rdx, %rsi
 ; BMI-NEXT:    andnq %rdi, %rsi, %rax
@@ -31,8 +31,91 @@ Entry:
   ret i64 %and3
 }
 
-define <16 x i8> @fooVec(<16 x i8> %w, <16 x i8> %x, <16 x i8> %y, <16 x i8> %z) {
-; NOBMI-LABEL: fooVec:
+define i32 @test_i32(i32 %w, i32 %x, i32 %y, i32 %z) {
+; NOBMI-LABEL: test_i32:
+; NOBMI:       # %bb.0: # %Entry
+; NOBMI-NEXT:    movl %ecx, %eax
+; NOBMI-NEXT:    andl %edx, %esi
+; NOBMI-NEXT:    notl %esi
+; NOBMI-NEXT:    andl %edi, %esi
+; NOBMI-NEXT:    notl %eax
+; NOBMI-NEXT:    orl %edx, %eax
+; NOBMI-NEXT:    andl %esi, %eax
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: test_i32:
+; BMI:       # %bb.0: # %Entry
+; BMI-NEXT:    andl %edx, %esi
+; BMI-NEXT:    andnl %edi, %esi, %eax
+; BMI-NEXT:    andnl %ecx, %edx, %ecx
+; BMI-NEXT:    andnl %eax, %ecx, %eax
+; BMI-NEXT:    retq
+Entry:
+  %and1 = and i32 %y, %x
+  %xor1 = xor i32 %and1, -1
+  %and2 = and i32 %xor1, %w
+  %.not = xor i32 %z, -1
+  %or1 = or i32 %.not, %y
+  %and3 = and i32 %and2, %or1
+  ret i32 %and3
+}
+
+define i16 @test_i16(i16 %w, i16 %x, i16 %y, i16 %z) {
+; NOBMI-LABEL: test_i16:
+; NOBMI:       # %bb.0: # %Entry
+; NOBMI-NEXT:    movl %ecx, %eax
+; NOBMI-NEXT:    andl %edx, %esi
+; NOBMI-NEXT:    notl %esi
+; NOBMI-NEXT:    andl %edi, %esi
+; NOBMI-NEXT:    notl %eax
+; NOBMI-NEXT:    orl %edx, %eax
+; NOBMI-NEXT:    andl %esi, %eax
+; NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: test_i16:
+; BMI:       # %bb.0: # %Entry
+; BMI-NEXT:    andl %edx, %esi
+; BMI-NEXT:    andnl %edi, %esi, %eax
+; BMI-NEXT:    notl %ecx
+; BMI-NEXT:    orl %edx, %ecx
+; BMI-NEXT:    andl %ecx, %eax
+; BMI-NEXT:    # kill: def $ax killed $ax killed $eax
+; BMI-NEXT:    retq
+Entry:
+  %and1 = and i16 %y, %x
+  %xor1 = xor i16 %and1, -1
+  %and2 = and i16 %xor1, %w
+  %.not = xor i16 %z, -1
+  %or1 = or i16 %.not, %y
+  %and3 = and i16 %and2, %or1
+  ret i16 %and3
+}
+
+define i8 @test_i8(i8 %w, i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: test_i8:
+; CHECK:       # %bb.0: # %Entry
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    andl %edx, %esi
+; CHECK-NEXT:    notb %sil
+; CHECK-NEXT:    andb %dil, %sil
+; CHECK-NEXT:    notb %cl
+; CHECK-NEXT:    orb %cl, %al
+; CHECK-NEXT:    andb %sil, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+Entry:
+  %and1 = and i8 %y, %x
+  %xor1 = xor i8 %and1, -1
+  %and2 = and i8 %xor1, %w
+  %.not = xor i8 %z, -1
+  %or1 = or i8 %.not, %y
+  %and3 = and i8 %and2, %or1
+  ret i8 %and3
+}
+
+define <16 x i8> @test_v16i8(<16 x i8> %w, <16 x i8> %x, <16 x i8> %y, <16 x i8> %z) {
+; NOBMI-LABEL: test_v16i8:
 ; NOBMI:       # %bb.0: # %Entry
 ; NOBMI-NEXT:    andps %xmm2, %xmm1
 ; NOBMI-NEXT:    andnps %xmm0, %xmm1
@@ -41,7 +124,7 @@ define <16 x i8> @fooVec(<16 x i8> %w, <16 x i8> %x, <16 x i8> %y, <16 x i8> %z)
 ; NOBMI-NEXT:    movaps %xmm2, %xmm0
 ; NOBMI-NEXT:    retq
 ;
-; BMI-LABEL: fooVec:
+; BMI-LABEL: test_v16i8:
 ; BMI:       # %bb.0: # %Entry
 ; BMI-NEXT:    vandps %xmm1, %xmm2, %xmm1
 ; BMI-NEXT:    vandnps %xmm0, %xmm1, %xmm0
@@ -58,6 +141,38 @@ Entry:
   ret <16 x i8> %and3
 }
 
+define <32 x i8> @test_v32i8(<32 x i8> %w, <32 x i8> %x, <32 x i8> %y, <32 x i8> %z) {
+; NOBMI-LABEL: test_v32i8:
+; NOBMI:       # %bb.0: # %Entry
+; NOBMI-NEXT:    andps %xmm4, %xmm2
+; NOBMI-NEXT:    andps %xmm5, %xmm3
+; NOBMI-NEXT:    andnps %xmm1, %xmm3
+; NOBMI-NEXT:    andnps %xmm0, %xmm2
+; NOBMI-NEXT:    andnps %xmm6, %xmm4
+; NOBMI-NEXT:    andnps %xmm2, %xmm4
+; NOBMI-NEXT:    andnps %xmm7, %xmm5
+; NOBMI-NEXT:    andnps %xmm3, %xmm5
+; NOBMI-NEXT:    movaps %xmm4, %xmm0
+; NOBMI-NEXT:    movaps %xmm5, %xmm1
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: test_v32i8:
+; BMI:       # %bb.0: # %Entry
+; BMI-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; BMI-NEXT:    vandnps %ymm0, %ymm1, %ymm0
+; BMI-NEXT:    vandnps %ymm3, %ymm2, %ymm1
+; BMI-NEXT:    vandnps %ymm0, %ymm1, %ymm0
+; BMI-NEXT:    retq
+Entry:
+  %and1 = and <32 x i8> %y, %x
+  %xor1 = xor <32 x i8> %and1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %and2 = and <32 x i8> %xor1, %w
+  %.not = xor <32 x i8> %z, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %or1 = or <32 x i8> %.not, %y
+  %and3 = and <32 x i8> %and2, %or1
+  ret <32 x i8> %and3
+}
+
 ; PR112347 - don't fold if we'd be inverting a constant, as demorgan normalisation will invert it back again.
 define void @PR112347(ptr %p0, ptr %p1, ptr %p2) {
 ; CHECK-LABEL: PR112347:

>From d3d2d72549e03403317ce325c17ad7d516643e2b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 16 Oct 2024 11:52:58 +0100
Subject: [PATCH 41/82] [DAG] SDPatternMatch - add missing BSWAP/CTPOP/CTTZ
 matchers
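
Usage sketch mirroring the unit-test additions below (illustration only; `DAG`, `DL` and `Op0` are assumed to be set up as in SelectionDAGPatternMatchTest):

  using namespace llvm;
  using namespace llvm::SDPatternMatch;
  SDValue Pop = DAG->getNode(ISD::CTPOP, DL, MVT::i32, Op0);
  bool MatchesPop  = sd_match(Pop, m_Ctpop(m_Value()));  // true
  bool MatchesCtlz = sd_match(Pop, m_Ctlz(m_Value()));   // false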

---
 llvm/include/llvm/CodeGen/SDPatternMatch.h        | 13 +++++++++++++
 .../CodeGen/SelectionDAGPatternMatchTest.cpp      | 15 +++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index 0af4f73b869c3c..f0e12f28c9e6d1 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -822,6 +822,11 @@ inline UnaryOpc_match<Opnd, true> m_ChainedUnaryOp(unsigned Opc,
   return UnaryOpc_match<Opnd, true>(Opc, Op);
 }
 
+template <typename Opnd>
+inline UnaryOpc_match<Opnd> m_BSwap(const Opnd &Op) {
+  return UnaryOpc_match<Opnd>(ISD::BSWAP, Op);
+}
+
 template <typename Opnd>
 inline UnaryOpc_match<Opnd> m_BitReverse(const Opnd &Op) {
   return UnaryOpc_match<Opnd>(ISD::BITREVERSE, Op);
@@ -892,10 +897,18 @@ template <typename Opnd> inline UnaryOpc_match<Opnd> m_FPToSI(const Opnd &Op) {
   return UnaryOpc_match<Opnd>(ISD::FP_TO_SINT, Op);
 }
 
+template <typename Opnd> inline UnaryOpc_match<Opnd> m_Ctpop(const Opnd &Op) {
+  return UnaryOpc_match<Opnd>(ISD::CTPOP, Op);
+}
+
 template <typename Opnd> inline UnaryOpc_match<Opnd> m_Ctlz(const Opnd &Op) {
   return UnaryOpc_match<Opnd>(ISD::CTLZ, Op);
 }
 
+template <typename Opnd> inline UnaryOpc_match<Opnd> m_Cttz(const Opnd &Op) {
+  return UnaryOpc_match<Opnd>(ISD::CTTZ, Op);
+}
+
 // === Constants ===
 struct ConstantInt_match {
   APInt *BindVal;
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index 7400b6c1984f72..a28e2b2b47dfa9 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -302,7 +302,12 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) {
   SDValue FPToSI = DAG->getNode(ISD::FP_TO_SINT, DL, FloatVT, Op2);
   SDValue FPToUI = DAG->getNode(ISD::FP_TO_UINT, DL, FloatVT, Op2);
 
+  SDValue Brev = DAG->getNode(ISD::BITREVERSE, DL, Int32VT, Op0);
+  SDValue Bswap = DAG->getNode(ISD::BSWAP, DL, Int32VT, Op0);
+
+  SDValue Ctpop = DAG->getNode(ISD::CTPOP, DL, Int32VT, Op0);
   SDValue Ctlz = DAG->getNode(ISD::CTLZ, DL, Int32VT, Op0);
+  SDValue Cttz = DAG->getNode(ISD::CTTZ, DL, Int32VT, Op0);
 
   using namespace SDPatternMatch;
   EXPECT_TRUE(sd_match(ZExt, m_UnaryOp(ISD::ZERO_EXTEND, m_Value())));
@@ -328,7 +333,17 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) {
   EXPECT_FALSE(sd_match(FPToUI, m_FPToSI(m_Value())));
   EXPECT_FALSE(sd_match(FPToSI, m_FPToUI(m_Value())));
 
+  EXPECT_TRUE(sd_match(Brev, m_BitReverse(m_Value())));
+  EXPECT_TRUE(sd_match(Bswap, m_BSwap(m_Value())));
+  EXPECT_FALSE(sd_match(Brev, m_BSwap(m_Value())));
+  EXPECT_FALSE(sd_match(Bswap, m_BitReverse(m_Value())));
+
+  EXPECT_TRUE(sd_match(Ctpop, m_Ctpop(m_Value())));
   EXPECT_TRUE(sd_match(Ctlz, m_Ctlz(m_Value())));
+  EXPECT_TRUE(sd_match(Cttz, m_Cttz(m_Value())));
+  EXPECT_FALSE(sd_match(Ctpop, m_Ctlz(m_Value())));
+  EXPECT_FALSE(sd_match(Ctlz, m_Cttz(m_Value())));
+  EXPECT_FALSE(sd_match(Cttz, m_Ctlz(m_Value())));
 }
 
 TEST_F(SelectionDAGPatternMatchTest, matchConstants) {

>From 49fa91edf7704dc385d3a97ddb74b7348be10bc7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 16 Oct 2024 11:57:18 +0100
Subject: [PATCH 42/82] [DAG] SDPatternMatch - add missing ROTL/ROTR matchers

---
 llvm/include/llvm/CodeGen/SDPatternMatch.h             | 10 ++++++++++
 .../unittests/CodeGen/SelectionDAGPatternMatchTest.cpp |  7 +++++++
 2 files changed, 17 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index f0e12f28c9e6d1..b3e249b7ebd5c4 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -759,6 +759,16 @@ inline BinaryOpc_match<LHS, RHS> m_Srl(const LHS &L, const RHS &R) {
   return BinaryOpc_match<LHS, RHS>(ISD::SRL, L, R);
 }
 
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS> m_Rotl(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS>(ISD::ROTL, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS> m_Rotr(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS>(ISD::ROTR, L, R);
+}
+
 template <typename LHS, typename RHS>
 inline BinaryOpc_match<LHS, RHS, true> m_FAdd(const LHS &L, const RHS &R) {
   return BinaryOpc_match<LHS, RHS, true>(ISD::FADD, L, R);
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index a28e2b2b47dfa9..dc40e5893b65e2 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -200,6 +200,8 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
   SDValue SMin = DAG->getNode(ISD::SMIN, DL, Int32VT, Op1, Op0);
   SDValue UMax = DAG->getNode(ISD::UMAX, DL, Int32VT, Op0, Op1);
   SDValue UMin = DAG->getNode(ISD::UMIN, DL, Int32VT, Op1, Op0);
+  SDValue Rotl = DAG->getNode(ISD::ROTL, DL, Int32VT, Op0, Op1);
+  SDValue Rotr = DAG->getNode(ISD::ROTR, DL, Int32VT, Op1, Op0);
 
   SDValue ICMP_GT = DAG->getSetCC(DL, MVT::i1, Op0, Op1, ISD::SETGT);
   SDValue ICMP_GE = DAG->getSetCC(DL, MVT::i1, Op0, Op1, ISD::SETGE);
@@ -246,6 +248,11 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
   EXPECT_FALSE(sd_match(DisOr, m_Add(m_Value(), m_Value())));
   EXPECT_TRUE(sd_match(DisOr, m_AddLike(m_Value(), m_Value())));
 
+  EXPECT_TRUE(sd_match(Rotl, m_Rotl(m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(Rotr, m_Rotr(m_Value(), m_Value())));
+  EXPECT_FALSE(sd_match(Rotl, m_Rotr(m_Value(), m_Value())));
+  EXPECT_FALSE(sd_match(Rotr, m_Rotl(m_Value(), m_Value())));
+
   EXPECT_TRUE(sd_match(SMax, m_c_BinOp(ISD::SMAX, m_Value(), m_Value())));
   EXPECT_TRUE(sd_match(SMax, m_SMax(m_Value(), m_Value())));
   EXPECT_TRUE(sd_match(SMax, m_SMaxLike(m_Value(), m_Value())));

>From 3ef630ac339f31686290f9460a40eb2a9c9f5bd0 Mon Sep 17 00:00:00 2001
From: Stefan Gränitz <stefan.graenitz at gmail.com>
Date: Wed, 16 Oct 2024 13:07:02 +0200
Subject: [PATCH 43/82] [lldb] Support tests with nested make invocations on
 Windows 2/2 (#112360)

Following up from https://github.com/llvm/llvm-project/pull/112342, we
roll out the fix and quote nested `make` invocations in all API tests.
---
 .../commands/expression/top-level/Makefile    |  2 +-
 .../commands/expression/weak_symbols/Makefile |  4 ++--
 .../API/commands/target/create-deps/Makefile  |  2 +-
 .../breakpoint/break_in_loaded_dylib/Makefile |  2 +-
 .../dlopen_other_executable/Makefile          |  2 +-
 lldb/test/API/functionalities/exec/Makefile   |  2 +-
 .../functionalities/jitloader_gdb/Makefile    |  2 +-
 .../functionalities/limit-debug-info/Makefile |  4 ++--
 .../load_after_attach/Makefile                |  2 +-
 .../API/functionalities/load_lazy/Makefile    |  6 +++---
 .../API/functionalities/load_unload/Makefile  | 10 +++++-----
 .../functionalities/load_using_paths/Makefile |  2 +-
 .../functionalities/scripted_process/Makefile |  2 +-
 .../stop-on-sharedlibrary-load/Makefile       |  4 ++--
 .../tail_call_frames/cross_dso/Makefile       |  2 +-
 .../target-new-solib-notifications/Makefile   | 20 +++++++++----------
 .../API/lang/c/conflicting-symbol/Makefile    |  2 +-
 .../API/lang/cpp/incomplete-types/Makefile    |  2 +-
 .../lang/cpp/namespace_definitions/Makefile   |  4 ++--
 .../lang/objc/conflicting-definition/Makefile |  4 ++--
 .../lang/objc/modules-hash-mismatch/Makefile  |  2 +-
 .../API/macosx/delay-init-dependency/Makefile |  2 +-
 .../API/macosx/expedited-thread-pcs/Makefile  |  2 +-
 lldb/test/API/macosx/indirect_symbol/Makefile |  4 ++--
 .../API/macosx/lc-note/kern-ver-str/Makefile  |  2 +-
 .../lc-note/multiple-binary-corefile/Makefile |  4 ++--
 .../macCatalystAppMacOSFramework/Makefile     |  2 +-
 lldb/test/API/macosx/skinny-corefile/Makefile |  4 ++--
 .../API/tools/lldb-dap/breakpoint/Makefile    |  2 +-
 .../tools/lldb-server/libraries-svr4/Makefile |  4 ++--
 30 files changed, 54 insertions(+), 54 deletions(-)

diff --git a/lldb/test/API/commands/expression/top-level/Makefile b/lldb/test/API/commands/expression/top-level/Makefile
index e5e9e78d4ead2a..51b27ddbb3c2f9 100644
--- a/lldb/test/API/commands/expression/top-level/Makefile
+++ b/lldb/test/API/commands/expression/top-level/Makefile
@@ -5,6 +5,6 @@ all: dummy
 include Makefile.rules
 
 dummy: dummy.cpp
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		CXX_SOURCES=dummy.cpp EXE=dummy
 
diff --git a/lldb/test/API/commands/expression/weak_symbols/Makefile b/lldb/test/API/commands/expression/weak_symbols/Makefile
index 6fd8133312ade6..1636e9b3032632 100644
--- a/lldb/test/API/commands/expression/weak_symbols/Makefile
+++ b/lldb/test/API/commands/expression/weak_symbols/Makefile
@@ -9,12 +9,12 @@ a.out: libdylib.dylib
 include Makefile.rules
 
 libdylib.dylib: dylib.c
-	$(MAKE) -C $(BUILDDIR) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -C $(BUILDDIR) -f $(MAKEFILE_RULES) \
 		C_SOURCES= DYLIB_C_SOURCES=dylib.c DYLIB_NAME=dylib \
 		CFLAGS_EXTRAS=-DHAS_THEM LD_EXTRAS=-dynamiclib
 
 hidden/libdylib.dylib:
 	mkdir hidden
-	$(MAKE) -C $(BUILDDIR)/hidden -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -C $(BUILDDIR)/hidden -f $(MAKEFILE_RULES) \
 		C_SOURCES= DYLIB_C_SOURCES=dylib.c DYLIB_NAME=dylib \
 		LD_EXTRAS=-dynamiclib
diff --git a/lldb/test/API/commands/target/create-deps/Makefile b/lldb/test/API/commands/target/create-deps/Makefile
index 3e5b1049b5a19f..866d550ee7d0ae 100644
--- a/lldb/test/API/commands/target/create-deps/Makefile
+++ b/lldb/test/API/commands/target/create-deps/Makefile
@@ -6,5 +6,5 @@ a.out: libload_a
 include Makefile.rules
 
 libload_a:
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_NAME=load_a DYLIB_CXX_SOURCES=a.cpp
diff --git a/lldb/test/API/functionalities/breakpoint/break_in_loaded_dylib/Makefile b/lldb/test/API/functionalities/breakpoint/break_in_loaded_dylib/Makefile
index 0f3fb37bdadf3c..112210e7e2c603 100644
--- a/lldb/test/API/functionalities/breakpoint/break_in_loaded_dylib/Makefile
+++ b/lldb/test/API/functionalities/breakpoint/break_in_loaded_dylib/Makefile
@@ -2,7 +2,7 @@ CXX_SOURCES := main.cpp
 USE_LIBDL := 1
 
 lib_b:
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=lib_b
 all: lib_b
 
diff --git a/lldb/test/API/functionalities/dlopen_other_executable/Makefile b/lldb/test/API/functionalities/dlopen_other_executable/Makefile
index 113b9fd7d3f18e..51fc01bdde75a4 100644
--- a/lldb/test/API/functionalities/dlopen_other_executable/Makefile
+++ b/lldb/test/API/functionalities/dlopen_other_executable/Makefile
@@ -2,7 +2,7 @@ C_SOURCES := main.c
 USE_LIBDL := 1
 
 other:
-	$(MAKE) -f $(MAKEFILE_RULES) C_SOURCES=other.c EXE=other
+	"$(MAKE)" -f $(MAKEFILE_RULES) C_SOURCES=other.c EXE=other
 all: other
 
 include Makefile.rules
diff --git a/lldb/test/API/functionalities/exec/Makefile b/lldb/test/API/functionalities/exec/Makefile
index 8b9148ea8a3551..65d4680077d26d 100644
--- a/lldb/test/API/functionalities/exec/Makefile
+++ b/lldb/test/API/functionalities/exec/Makefile
@@ -5,5 +5,5 @@ all: secondprog
 include Makefile.rules
 
 secondprog: secondprog.cpp
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		CXX_SOURCES=secondprog.cpp EXE=secondprog
diff --git a/lldb/test/API/functionalities/jitloader_gdb/Makefile b/lldb/test/API/functionalities/jitloader_gdb/Makefile
index 357b1f83684fe2..9998cc9cf833c7 100644
--- a/lldb/test/API/functionalities/jitloader_gdb/Makefile
+++ b/lldb/test/API/functionalities/jitloader_gdb/Makefile
@@ -5,5 +5,5 @@ all: a.out simple
 include Makefile.rules
 
 simple:
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		C_SOURCES=simple.c EXE=simple
diff --git a/lldb/test/API/functionalities/limit-debug-info/Makefile b/lldb/test/API/functionalities/limit-debug-info/Makefile
index 874b3a15e0fee9..fa867a7aeb7c6c 100644
--- a/lldb/test/API/functionalities/limit-debug-info/Makefile
+++ b/lldb/test/API/functionalities/limit-debug-info/Makefile
@@ -17,11 +17,11 @@ include Makefile.rules
 a.out: libone libtwo
 
 libone:
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 	  DYLIB_ONLY=YES DYLIB_CXX_SOURCES=one.cpp DYLIB_NAME=one \
 	  CFLAGS_EXTRAS="$(ONE_CXXFLAGS)"
 
 libtwo: libone
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 	  DYLIB_ONLY=YES DYLIB_CXX_SOURCES=two.cpp DYLIB_NAME=two \
 	  CFLAGS_EXTRAS="$(TWO_CXXFLAGS)" LD_EXTRAS="-L. -lone"
diff --git a/lldb/test/API/functionalities/load_after_attach/Makefile b/lldb/test/API/functionalities/load_after_attach/Makefile
index 0f3fb37bdadf3c..112210e7e2c603 100644
--- a/lldb/test/API/functionalities/load_after_attach/Makefile
+++ b/lldb/test/API/functionalities/load_after_attach/Makefile
@@ -2,7 +2,7 @@ CXX_SOURCES := main.cpp
 USE_LIBDL := 1
 
 lib_b:
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=lib_b
 all: lib_b
 
diff --git a/lldb/test/API/functionalities/load_lazy/Makefile b/lldb/test/API/functionalities/load_lazy/Makefile
index 81bc7dcb4d05f0..8e1d06b1e39c3f 100644
--- a/lldb/test/API/functionalities/load_lazy/Makefile
+++ b/lldb/test/API/functionalities/load_lazy/Makefile
@@ -17,13 +17,13 @@ else
 endif
 
 t1: t2_0
-	$(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_C_SOURCES=t1.c DYLIB_NAME=t1 LD_EXTRAS="-L. $(LINKFLAGS)"
 
 t2_0:
-	$(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_C_SOURCES=t2_0.c DYLIB_NAME=t2_0
 
 t2_1:
-	$(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_C_SOURCES=t2_1.c DYLIB_NAME=t2_1
diff --git a/lldb/test/API/functionalities/load_unload/Makefile b/lldb/test/API/functionalities/load_unload/Makefile
index e73ec731087641..dd7d16029427a1 100644
--- a/lldb/test/API/functionalities/load_unload/Makefile
+++ b/lldb/test/API/functionalities/load_unload/Makefile
@@ -7,25 +7,25 @@ a.out: lib_b lib_a lib_c lib_d hidden_lib_d
 include Makefile.rules
 
 lib_a: lib_b
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=loadunload_a \
 		LD_EXTRAS="-L. -lloadunload_b"
 
 lib_b:
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=loadunload_b
 
 lib_c:
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=c.cpp DYLIB_NAME=loadunload_c
 
 lib_d:
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=loadunload_d
 ifeq ($(OS),Darwin)
 	install_name_tool -id @executable_path/libloadunload_d.dylib libloadunload_d.dylib
 endif
 
 hidden_lib_d: hidden
-	$(MAKE) VPATH=$(SRCDIR)/hidden -C hidden -f $(MAKEFILE_RULES) \
+	"$(MAKE)" VPATH=$(SRCDIR)/hidden -C hidden -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=loadunload_d
diff --git a/lldb/test/API/functionalities/load_using_paths/Makefile b/lldb/test/API/functionalities/load_using_paths/Makefile
index 814a96013756a9..f973a389d585f3 100644
--- a/lldb/test/API/functionalities/load_using_paths/Makefile
+++ b/lldb/test/API/functionalities/load_using_paths/Makefile
@@ -6,6 +6,6 @@ all: hidden_lib a.out
 include Makefile.rules
 
 hidden_lib:
-	$(MAKE) VPATH=$(SRCDIR)/hidden -C hidden -f $(MAKEFILE_RULES) \
+	"$(MAKE)" VPATH=$(SRCDIR)/hidden -C hidden -f $(MAKEFILE_RULES) \
 	DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=loadunload
 
diff --git a/lldb/test/API/functionalities/scripted_process/Makefile b/lldb/test/API/functionalities/scripted_process/Makefile
index ba739451fc7ef3..d4f12fbb3c4ef4 100644
--- a/lldb/test/API/functionalities/scripted_process/Makefile
+++ b/lldb/test/API/functionalities/scripted_process/Makefile
@@ -9,7 +9,7 @@ CXXFLAGS_EXTRAS := -target $(TRIPLE)
 all: libbaz.dylib a.out
 
 libbaz.dylib: baz.cpp
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_NAME=baz DYLIB_CXX_SOURCES=baz.cpp
 
 include Makefile.rules
diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile
index 4abcab84eac292..e4b0e86c0c36c8 100644
--- a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile
+++ b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile
@@ -6,11 +6,11 @@ a.out: lib_a lib_b
 include Makefile.rules
 
 lib_a:
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=load_a
 
 lib_b:
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=load_b
 
 
diff --git a/lldb/test/API/functionalities/tail_call_frames/cross_dso/Makefile b/lldb/test/API/functionalities/tail_call_frames/cross_dso/Makefile
index 42c010be9a03a5..963ce2ac94d92a 100644
--- a/lldb/test/API/functionalities/tail_call_frames/cross_dso/Makefile
+++ b/lldb/test/API/functionalities/tail_call_frames/cross_dso/Makefile
@@ -10,4 +10,4 @@ a.out: lib_One lib_Two
 lib_One: lib_Two
 
 lib_%:
-	$(MAKE) VPATH=$(SRCDIR)/$* -I $(SRCDIR) -f $(SRCDIR)/$*.mk DSYMUTIL=$(DSYMUTIL)
+	"$(MAKE)" VPATH=$(SRCDIR)/$* -I $(SRCDIR) -f $(SRCDIR)/$*.mk DSYMUTIL=$(DSYMUTIL)
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/Makefile b/lldb/test/API/functionalities/target-new-solib-notifications/Makefile
index 6c61d210eeb2f3..e3b48697fd7837 100644
--- a/lldb/test/API/functionalities/target-new-solib-notifications/Makefile
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/Makefile
@@ -1,23 +1,23 @@
 CXX_SOURCES := main.cpp
-LD_EXTRAS := -L. -l_d -l_c -l_a -l_b
+LD_EXTRAS := -L. -l_d -l_c -l_a -l_b
 
 a.out: lib_b lib_a lib_c lib_d
 
 include Makefile.rules
 
 lib_a: lib_b
-	$(MAKE) -f $(MAKEFILE_RULES) \
-		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=_a \
-		LD_EXTRAS="-L. -l_b"
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
+		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=_a \
+		LD_EXTRAS="-L. -l_b"
 
 lib_b:
-	$(MAKE) -f $(MAKEFILE_RULES) \
-		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=_b
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
+		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=_b
 
 lib_c:
-	$(MAKE) -f $(MAKEFILE_RULES) \
-		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=c.cpp DYLIB_NAME=_c
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
+		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=c.cpp DYLIB_NAME=_c
 
 lib_d:
-	$(MAKE) -f $(MAKEFILE_RULES) \
-		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=_d
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
+		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=_d
diff --git a/lldb/test/API/lang/c/conflicting-symbol/Makefile b/lldb/test/API/lang/c/conflicting-symbol/Makefile
index 81594a1265da24..1331c4e1ebfaf2 100644
--- a/lldb/test/API/lang/c/conflicting-symbol/Makefile
+++ b/lldb/test/API/lang/c/conflicting-symbol/Makefile
@@ -7,4 +7,4 @@ include Makefile.rules
 a.out: lib_One lib_Two
 
 lib_%:
-	$(MAKE) VPATH=$(SRCDIR)/$* -I $(SRCDIR) -f $(SRCDIR)/$*.mk
+	"$(MAKE)" VPATH=$(SRCDIR)/$* -I $(SRCDIR) -f $(SRCDIR)/$*.mk
diff --git a/lldb/test/API/lang/cpp/incomplete-types/Makefile b/lldb/test/API/lang/cpp/incomplete-types/Makefile
index f42ac2e81cc76e..0cf3f6a31caa2f 100644
--- a/lldb/test/API/lang/cpp/incomplete-types/Makefile
+++ b/lldb/test/API/lang/cpp/incomplete-types/Makefile
@@ -16,7 +16,7 @@ main.o: CFLAGS_EXTRAS = -flimit-debug-info
 
 limit: a.o main.o
 	mkdir -p build_limit
-	$(MAKE) -C $(BUILDDIR)/build_limit -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -C $(BUILDDIR)/build_limit -f $(MAKEFILE_RULES) \
 		EXE=../limit CXX_SOURCES="length.cpp ../a.o ../main.o" \
 		CFLAGS_EXTRAS=-flimit-debug-info NO_LIMIT_DEBUG_INFO_FLAGS=""
 
diff --git a/lldb/test/API/lang/cpp/namespace_definitions/Makefile b/lldb/test/API/lang/cpp/namespace_definitions/Makefile
index fc9165f67f428e..b17d70fc928710 100644
--- a/lldb/test/API/lang/cpp/namespace_definitions/Makefile
+++ b/lldb/test/API/lang/cpp/namespace_definitions/Makefile
@@ -6,10 +6,10 @@ a.out: liba libb
 include Makefile.rules
 
 liba:
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_NAME=a DYLIB_CXX_SOURCES=a.cpp
 
 libb:
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_NAME=b DYLIB_CXX_SOURCES=b.cpp
 
diff --git a/lldb/test/API/lang/objc/conflicting-definition/Makefile b/lldb/test/API/lang/objc/conflicting-definition/Makefile
index 00a0769a086f08..cba79c94d46baa 100644
--- a/lldb/test/API/lang/objc/conflicting-definition/Makefile
+++ b/lldb/test/API/lang/objc/conflicting-definition/Makefile
@@ -9,14 +9,14 @@ include Makefile.rules
 
 libTest.dylib:	Test/Test.m
 	mkdir -p Test
-	$(MAKE) MAKE_DSYM=YES -f $(MAKEFILE_RULES) \
+	"$(MAKE)" MAKE_DSYM=YES -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_NAME=Test DYLIB_OBJC_SOURCES=Test/Test.m \
 		LD_EXTRAS="-lobjc -framework Foundation" \
 		CFLAGS_EXTRAS=-I$(SRCDIR)
 
 libTestExt.dylib: TestExt/TestExt.m
 	mkdir -p TestExt
-	$(MAKE) MAKE_DSYM=YES -f $(MAKEFILE_RULES) \
+	"$(MAKE)" MAKE_DSYM=YES -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_NAME=TestExt DYLIB_OBJC_SOURCES=TestExt/TestExt.m \
 		LD_EXTRAS="-lobjc -framework Foundation -lTest -L." \
 		CFLAGS_EXTRAS=-I$(SRCDIR)
diff --git a/lldb/test/API/lang/objc/modules-hash-mismatch/Makefile b/lldb/test/API/lang/objc/modules-hash-mismatch/Makefile
index 59bf009f68677b..57da670b69ab33 100644
--- a/lldb/test/API/lang/objc/modules-hash-mismatch/Makefile
+++ b/lldb/test/API/lang/objc/modules-hash-mismatch/Makefile
@@ -5,7 +5,7 @@ USE_PRIVATE_MODULE_CACHE = YES
 .PHONY: update-module
 
 all: $(EXE)
-	$(MAKE) -f $(SRCDIR)/Makefile update-module
+	"$(MAKE)" -f $(SRCDIR)/Makefile update-module
 
 include Makefile.rules
 
diff --git a/lldb/test/API/macosx/delay-init-dependency/Makefile b/lldb/test/API/macosx/delay-init-dependency/Makefile
index 246ea0f34e1a1c..7421c68b79baa4 100644
--- a/lldb/test/API/macosx/delay-init-dependency/Makefile
+++ b/lldb/test/API/macosx/delay-init-dependency/Makefile
@@ -7,5 +7,5 @@ all: build-libfoo a.out
 include Makefile.rules
 
 build-libfoo: foo.c
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_C_SOURCES=foo.c DYLIB_NAME=foo DYLIB_ONLY=YES
diff --git a/lldb/test/API/macosx/expedited-thread-pcs/Makefile b/lldb/test/API/macosx/expedited-thread-pcs/Makefile
index 7799f06e770970..73a969831e6732 100644
--- a/lldb/test/API/macosx/expedited-thread-pcs/Makefile
+++ b/lldb/test/API/macosx/expedited-thread-pcs/Makefile
@@ -6,6 +6,6 @@ all: build-libfoo a.out
 include Makefile.rules
 
 build-libfoo: foo.c
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_C_SOURCES=foo.c DYLIB_NAME=foo DYLIB_ONLY=YES
 
diff --git a/lldb/test/API/macosx/indirect_symbol/Makefile b/lldb/test/API/macosx/indirect_symbol/Makefile
index 9069302b39c4f8..dee3e184fe19bf 100644
--- a/lldb/test/API/macosx/indirect_symbol/Makefile
+++ b/lldb/test/API/macosx/indirect_symbol/Makefile
@@ -7,11 +7,11 @@ all: build-libindirect build-libreepxoprt a.out
 include Makefile.rules
 
 build-libindirect: indirect.c
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_C_SOURCES=indirect.c DYLIB_NAME=indirect DYLIB_ONLY=YES \
 		LD_EXTRAS="-Wl,-image_base,0x200000000"
 
 build-libreepxoprt: reexport.c
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_C_SOURCES=reexport.c DYLIB_NAME=reexport DYLIB_ONLY=YES \
 		LD_EXTRAS="-L. -lindirect -Wl,-alias_list,$(SRCDIR)/alias.list"
diff --git a/lldb/test/API/macosx/lc-note/kern-ver-str/Makefile b/lldb/test/API/macosx/lc-note/kern-ver-str/Makefile
index 05d9552a802091..01b4acfdcfd2a1 100644
--- a/lldb/test/API/macosx/lc-note/kern-ver-str/Makefile
+++ b/lldb/test/API/macosx/lc-note/kern-ver-str/Makefile
@@ -5,7 +5,7 @@ C_SOURCES := main.c
 all: a.out create-empty-corefile
 
 create-empty-corefile:
-	$(MAKE) -f $(MAKEFILE_RULES) EXE=create-empty-corefile \
+	"$(MAKE)" -f $(MAKEFILE_RULES) EXE=create-empty-corefile \
 	    CXX=$(CC) CXX_SOURCES=create-empty-corefile.cpp
 
 include Makefile.rules
diff --git a/lldb/test/API/macosx/lc-note/multiple-binary-corefile/Makefile b/lldb/test/API/macosx/lc-note/multiple-binary-corefile/Makefile
index 8e561f17383fe1..229235cda999f9 100644
--- a/lldb/test/API/macosx/lc-note/multiple-binary-corefile/Makefile
+++ b/lldb/test/API/macosx/lc-note/multiple-binary-corefile/Makefile
@@ -10,11 +10,11 @@ create-empty-corefile:
 		CXX_SOURCES=create-multibin-corefile.cpp
 
 libone.dylib: one.c
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_NAME=one DYLIB_C_SOURCES=one.c
 
 libtwo.dylib: two.c
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_NAME=two DYLIB_C_SOURCES=two.c
 
 include Makefile.rules
diff --git a/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile b/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile
index c77a186724fda1..0dc9e71c3276a3 100644
--- a/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile
+++ b/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile
@@ -11,7 +11,7 @@ override CC=xcrun clang
 all: libfoo.dylib a.out
 
 libfoo.dylib: foo.c
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_NAME=foo DYLIB_C_SOURCES=foo.c
 
 include Makefile.rules
diff --git a/lldb/test/API/macosx/skinny-corefile/Makefile b/lldb/test/API/macosx/skinny-corefile/Makefile
index efe37f3d2b8b2f..fce43a36c33ac1 100644
--- a/lldb/test/API/macosx/skinny-corefile/Makefile
+++ b/lldb/test/API/macosx/skinny-corefile/Makefile
@@ -6,10 +6,10 @@ include Makefile.rules
 a.out: libto-be-removed libpresent
 
 libto-be-removed: libpresent
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 	  DYLIB_ONLY=YES DYLIB_C_SOURCES=to-be-removed.c DYLIB_NAME=to-be-removed \
 	  LD_EXTRAS="-L. -lpresent"
 
 libpresent:
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 	  DYLIB_ONLY=YES DYLIB_C_SOURCES=present.c DYLIB_NAME=present 
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/Makefile b/lldb/test/API/tools/lldb-dap/breakpoint/Makefile
index 30a6400184933d..7634f513e85233 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/Makefile
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/Makefile
@@ -15,5 +15,5 @@ main-copy.cpp: main.cpp
 
 # The following shared library will be used to test breakpoints under dynamic loading
 libother:  other-copy.c
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_ONLY=YES DYLIB_C_SOURCES=other-copy.c DYLIB_NAME=other 
diff --git a/lldb/test/API/tools/lldb-server/libraries-svr4/Makefile b/lldb/test/API/tools/lldb-server/libraries-svr4/Makefile
index 5b5c1dcef783a5..f13b1ac15928a3 100644
--- a/lldb/test/API/tools/lldb-server/libraries-svr4/Makefile
+++ b/lldb/test/API/tools/lldb-server/libraries-svr4/Makefile
@@ -9,11 +9,11 @@ a.out: svr4lib_a svr4lib_b_quote
 include Makefile.rules
 
 svr4lib_a:
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_NAME=svr4lib_a DYLIB_CXX_SOURCES=svr4lib_a.cpp \
 		DYLIB_ONLY=YES
 
 svr4lib_b_quote:
-	$(MAKE) -f $(MAKEFILE_RULES) \
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
 		DYLIB_NAME=svr4lib_b\\\" DYLIB_CXX_SOURCES=svr4lib_b_quote.cpp \
 		DYLIB_ONLY=YES

>From 09361953116770b646decf5820a9455ada2ba4fc Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Wed, 16 Oct 2024 19:13:52 +0800
Subject: [PATCH 44/82] [InstCombine] Drop `samesign` in InstCombine (#112480)

Closes https://github.com/llvm/llvm-project/issues/112476.
---
 .../InstCombine/InstCombineCompares.cpp          |  4 ++--
 .../Transforms/InstCombine/InstCombineSelect.cpp |  1 +
 .../Transforms/InstCombine/icmp-and-shift.ll     | 13 +++++++++++++
 .../Transforms/InstCombine/icmp-equality-test.ll | 16 ++++++++++++++++
 llvm/test/Transforms/InstCombine/icmp.ll         | 15 +++++++++++++++
 5 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 7129499e0f8f9d..18a6fdcec1728e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1738,7 +1738,7 @@ Instruction *InstCombinerImpl::foldICmpAndShift(ICmpInst &Cmp,
 
     // Compute X & (C2 << Y).
     Value *NewAnd = Builder.CreateAnd(Shift->getOperand(0), NewShift);
-    return replaceOperand(Cmp, 0, NewAnd);
+    return new ICmpInst(Cmp.getPredicate(), NewAnd, Cmp.getOperand(1));
   }
 
   return nullptr;
@@ -1844,7 +1844,7 @@ Instruction *InstCombinerImpl::foldICmpAndConstConst(ICmpInst &Cmp,
                                                /*HasNUW=*/true),
                              One, Or->getName());
         Value *NewAnd = Builder.CreateAnd(A, NewOr, And->getName());
-        return replaceOperand(Cmp, 0, NewAnd);
+        return new ICmpInst(Cmp.getPredicate(), NewAnd, Cmp.getOperand(1));
       }
     }
   }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 8be2eeed84adfc..623694663aa1be 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1448,6 +1448,7 @@ Instruction *InstCombinerImpl::foldSelectEqualityTest(SelectInst &Sel) {
              m_c_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(X), m_Specific(Y))))
     return nullptr;
 
+  cast<ICmpInst>(XeqY)->setSameSign(false);
   return replaceInstUsesWith(Sel, XeqY);
 }
 
diff --git a/llvm/test/Transforms/InstCombine/icmp-and-shift.ll b/llvm/test/Transforms/InstCombine/icmp-and-shift.ll
index 684ece21b1166e..d092363309fec0 100644
--- a/llvm/test/Transforms/InstCombine/icmp-and-shift.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-and-shift.ll
@@ -619,6 +619,19 @@ define i1 @test_shr_and_1_ne_0(i32 %a, i32 %b) {
   ret i1 %cmp
 }
 
+define i1 @test_shr_and_1_ne_0_samesign(i32 %a, i32 %b) {
+; CHECK-LABEL: @test_shr_and_1_ne_0_samesign(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i32 1, [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[A:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr i32 %a, %b
+  %and = and i32 %shr, 1
+  %cmp = icmp samesign ne i32 %and, 0
+  ret i1 %cmp
+}
+
 define i1 @test_const_shr_and_1_ne_0(i32 %b) {
 ; CHECK-LABEL: @test_const_shr_and_1_ne_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i32 1, [[B:%.*]]
diff --git a/llvm/test/Transforms/InstCombine/icmp-equality-test.ll b/llvm/test/Transforms/InstCombine/icmp-equality-test.ll
index c2740ca7fe8aa9..b9d8f2d54def3d 100644
--- a/llvm/test/Transforms/InstCombine/icmp-equality-test.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-equality-test.ll
@@ -33,6 +33,22 @@ entry:
   ret i1 %equal
 }
 
+define i1 @icmp_equality_test_constant_samesign(i42 %X, i42 %Y) {
+; CHECK-LABEL: @icmp_equality_test_constant_samesign(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[XEQY:%.*]] = icmp eq i42 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[XEQY]]
+;
+entry:
+  %XeqC = icmp eq i42 %X, -42
+  %YeqC = icmp eq i42 %Y, -42
+  %XeqY = icmp samesign eq i42 %X, %Y
+  %not.YeqC = xor i1 %YeqC, true
+  %and = select i1 %not.YeqC, i1 %XeqY, i1 false
+  %equal = select i1 %XeqC, i1 %YeqC, i1 %and
+  ret i1 %equal
+}
+
 define i1 @icmp_equality_test_swift_optional_pointers(i64 %X, i64 %Y) {
 ; CHECK-LABEL: @icmp_equality_test_swift_optional_pointers(
 ; CHECK-NEXT:  entry:
diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll
index 5e80134b153be7..7cafb4885ff0ee 100644
--- a/llvm/test/Transforms/InstCombine/icmp.ll
+++ b/llvm/test/Transforms/InstCombine/icmp.ll
@@ -3203,6 +3203,21 @@ define i1 @icmp_and_or_lshr(i32 %x, i32 %y) {
   ret i1 %ret
 }
 
+define i1 @icmp_and_or_lshr_samesign(i32 %x, i32 %y) {
+; CHECK-LABEL: @icmp_and_or_lshr_samesign(
+; CHECK-NEXT:    [[SHF1:%.*]] = shl nuw i32 1, [[Y:%.*]]
+; CHECK-NEXT:    [[OR2:%.*]] = or i32 [[SHF1]], 1
+; CHECK-NEXT:    [[AND3:%.*]] = and i32 [[X:%.*]], [[OR2]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne i32 [[AND3]], 0
+; CHECK-NEXT:    ret i1 [[RET]]
+;
+  %shf = lshr i32 %x, %y
+  %or = or i32 %shf, %x
+  %and = and i32 %or, 1
+  %ret = icmp samesign ne i32 %and, 0
+  ret i1 %ret
+}
+
 define <2 x i1> @icmp_and_or_lshr_vec(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @icmp_and_or_lshr_vec(
 ; CHECK-NEXT:    [[SHF:%.*]] = lshr <2 x i32> [[X:%.*]], [[Y:%.*]]

>From 682fa797b7358733df9e439241a9ef2906003adf Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Wed, 16 Oct 2024 12:44:09 +0100
Subject: [PATCH 45/82] InstCombine/Select: remove redundant code (NFC)
 (#112388)

InstCombinerImpl::foldSelectInstWithICmp has some inlined code for
select-icmp-xor simplification, but this simplification is already done
by other code, via another path:

  (X & Y) == 0 ? X : X ^ Y ->
  ((X & Y) == 0 ? 0 : Y) ^ X ->
  (X & Y) ^ X ->
  X & ~Y

Cover the cases that it claims to simplify, and demonstrate that
stripping it doesn't cause test changes.
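
A standalone sketch (illustration only) that checks the end-to-end equivalence for the power-of-two case used in the new tests:

  #include <cassert>
  #include <cstdint>
  int main() {
    // (x & 4) == 0 ? x : (x ^ 4)  is  x with bit 2 cleared, i.e. x & ~4.
    for (int v = 0; v < 256; ++v) {
      std::uint8_t x = static_cast<std::uint8_t>(v);
      std::uint8_t sel =
          ((x & 4) == 0) ? x : static_cast<std::uint8_t>(x ^ 4);
      assert(sel == static_cast<std::uint8_t>(x & ~4));
    }
    return 0;
  }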
---
 .../InstCombine/InstCombineSelect.cpp         |  50 -----
 .../Transforms/InstCombine/select-icmp-xor.ll | 190 ++++++++++++++++++
 2 files changed, 190 insertions(+), 50 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/select-icmp-xor.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 623694663aa1be..ed44f0596f3272 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1954,56 +1954,6 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
     return &SI;
   }
 
-  // FIXME: This code is nearly duplicated in InstSimplify. Using/refactoring
-  // decomposeBitTestICmp() might help.
-  if (TrueVal->getType()->isIntOrIntVectorTy()) {
-    unsigned BitWidth =
-        DL.getTypeSizeInBits(TrueVal->getType()->getScalarType());
-    APInt MinSignedValue = APInt::getSignedMinValue(BitWidth);
-    Value *X;
-    const APInt *Y, *C;
-    bool TrueWhenUnset;
-    bool IsBitTest = false;
-    if (ICmpInst::isEquality(Pred) &&
-        match(CmpLHS, m_And(m_Value(X), m_Power2(Y))) &&
-        match(CmpRHS, m_Zero())) {
-      IsBitTest = true;
-      TrueWhenUnset = Pred == ICmpInst::ICMP_EQ;
-    } else if (Pred == ICmpInst::ICMP_SLT && match(CmpRHS, m_Zero())) {
-      X = CmpLHS;
-      Y = &MinSignedValue;
-      IsBitTest = true;
-      TrueWhenUnset = false;
-    } else if (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_AllOnes())) {
-      X = CmpLHS;
-      Y = &MinSignedValue;
-      IsBitTest = true;
-      TrueWhenUnset = true;
-    }
-    if (IsBitTest) {
-      Value *V = nullptr;
-      // (X & Y) == 0 ? X : X ^ Y  --> X & ~Y
-      if (TrueWhenUnset && TrueVal == X &&
-          match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
-        V = Builder.CreateAnd(X, ~(*Y));
-      // (X & Y) != 0 ? X ^ Y : X  --> X & ~Y
-      else if (!TrueWhenUnset && FalseVal == X &&
-               match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
-        V = Builder.CreateAnd(X, ~(*Y));
-      // (X & Y) == 0 ? X ^ Y : X  --> X | Y
-      else if (TrueWhenUnset && FalseVal == X &&
-               match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
-        V = Builder.CreateOr(X, *Y);
-      // (X & Y) != 0 ? X : X ^ Y  --> X | Y
-      else if (!TrueWhenUnset && TrueVal == X &&
-               match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
-        V = Builder.CreateOr(X, *Y);
-
-      if (V)
-        return replaceInstUsesWith(SI, V);
-    }
-  }
-
   if (Instruction *V =
           foldSelectICmpAndAnd(SI.getType(), ICI, TrueVal, FalseVal, Builder))
     return V;
diff --git a/llvm/test/Transforms/InstCombine/select-icmp-xor.ll b/llvm/test/Transforms/InstCombine/select-icmp-xor.ll
new file mode 100644
index 00000000000000..c8ce114a683eee
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/select-icmp-xor.ll
@@ -0,0 +1,190 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=instcombine -S %s | FileCheck %s
+
+define i8 @select_icmp_eq_pow2(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_eq_pow2(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = and i8 [[X]], -5
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %and = and i8 %x, 4
+  %icmp = icmp eq i8 %and, 0
+  %xor = xor i8 %x, 4
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}
+
+define i8 @select_icmp_eq_pow2_flipped(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_eq_pow2_flipped(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = or i8 [[X]], 4
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %and = and i8 %x, 4
+  %icmp = icmp eq i8 %and, 0
+  %xor = xor i8 %x, 4
+  %sel = select i1 %icmp, i8 %xor, i8 %x
+  ret i8 %sel
+}
+
+define i8 @select_icmp_eq_not_pow2(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_eq_not_pow2(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[X]], 5
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp eq i8 [[AND]], 0
+; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[X]], 5
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[ICMP]], i8 [[X]], i8 [[XOR]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %and = and i8 %x, 5
+  %icmp = icmp eq i8 %and, 0
+  %xor = xor i8 %x, 5
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}
+
+define i8 @select_icmp_ne_pow2(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_ne_pow2(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = and i8 [[X]], -5
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %and = and i8 %x, 4
+  %icmp = icmp ne i8 %and, 0
+  %xor = xor i8 %x, 4
+  %sel = select i1 %icmp, i8 %xor, i8 %x
+  ret i8 %sel
+}
+
+define i8 @select_icmp_ne_pow2_flipped(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_ne_pow2_flipped(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = or i8 [[X]], 4
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %and = and i8 %x, 4
+  %icmp = icmp ne i8 %and, 0
+  %xor = xor i8 %x, 4
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}
+
+define i8 @select_icmp_ne_not_pow2(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_ne_not_pow2(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[X]], 5
+; CHECK-NEXT:    [[ICMP_NOT:%.*]] = icmp eq i8 [[AND]], 0
+; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[X]], 5
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[ICMP_NOT]], i8 [[X]], i8 [[XOR]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %and = and i8 %x, 5
+  %icmp = icmp ne i8 %and, 0
+  %xor = xor i8 %x, 5
+  %sel = select i1 %icmp, i8 %xor, i8 %x
+  ret i8 %sel
+}
+
+define i8 @select_icmp_slt_zero_smin(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_slt_zero_smin(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = or i8 [[X]], -128
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %icmp = icmp slt i8 %x, 0
+  %xor = xor i8 %x, -128
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}
+
+define i8 @select_icmp_slt_zero_smin_flipped(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_slt_zero_smin_flipped(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = and i8 [[X]], 127
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %icmp = icmp slt i8 %x, 0
+  %xor = xor i8 %x, -128
+  %sel = select i1 %icmp, i8 %xor, i8 %x
+  ret i8 %sel
+}
+
+define i8 @select_icmp_slt_not_zero(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_slt_not_zero(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp slt i8 [[X]], 1
+; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[X]], -128
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[ICMP]], i8 [[X]], i8 [[XOR]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %icmp = icmp slt i8 %x, 1
+  %xor = xor i8 %x, -128
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}
+
+define i8 @select_icmp_slt_not_smin(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_slt_not_smin(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp slt i8 [[X]], 0
+; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[X]], -127
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[ICMP]], i8 [[X]], i8 [[XOR]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %icmp = icmp slt i8 %x, 0
+  %xor = xor i8 %x, -127
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}
+
+define i8 @select_icmp_sgt_allones_smin(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_sgt_allones_smin(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = and i8 [[X]], 127
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %icmp = icmp sgt i8 %x, 255
+  %xor = xor i8 %x, -128
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}
+
+define i8 @select_icmp_sgt_allones_smin_flipped(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_sgt_allones_smin_flipped(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = or i8 [[X]], -128
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %icmp = icmp sgt i8 %x, 255
+  %xor = xor i8 %x, -128
+  %sel = select i1 %icmp, i8 %xor, i8 %x
+  ret i8 %sel
+}
+
+define i8 @select_icmp_sgt_not_allones(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_sgt_not_allones(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp sgt i8 [[X]], -2
+; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[X]], -128
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[ICMP]], i8 [[X]], i8 [[XOR]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %icmp = icmp sgt i8 %x, 254
+  %xor = xor i8 %x, -128
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}
+
+define i8 @select_icmp_sgt_not_smin(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_sgt_not_smin(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[X]], -127
+; CHECK-NEXT:    [[ICMP1:%.*]] = icmp slt i8 [[X]], 0
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[ICMP1]], i8 [[XOR]], i8 [[X]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %icmp = icmp sgt i8 %x, 255
+  %xor = xor i8 %x, -127
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}

>From f3e804b9fd561c0da970536643e2a5cd6c3d4215 Mon Sep 17 00:00:00 2001
From: Donát Nagy <donat.nagy at ericsson.com>
Date: Wed, 16 Oct 2024 13:48:38 +0200
Subject: [PATCH 46/82] [analyzer][clang-tidy][NFC] Clean up eagerly-assume
 handling (#112209)

This commit is a collection of several very minor code quality
improvements. The main goal is to remove the misleading "Bin" substring
from the names of several methods and variables (like
`evalEagerlyAssumeBinOpBifurcation`), which are also applied to the
unary logical-not operator.

In addition to this, I clarified the doc-comment of the method
`evalEagerlyAssumeBinOpBifurcation` and refactored its body to fix the
capitalization of variable names and replace an obsolete use of
`std::tie` with a structured binding.

Finally, the data member `eagerlyAssumeBinOpBifurcation` of the class
`AnalyzerOptions` was completely removed (including a line in clang-tidy
that sets it to true), because it was never read by any code.

Note that the eagerly-assume mode of the analyzer is controlled by a
different boolean member of `AnalyzerOptions` which is called
`ShouldEagerlyAssume` and is defined via the macro magic from
`AnalyzerOptions.def`.
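
To make this concrete, here is a minimal, hypothetical C++ snippet
(invented for illustration; it is not part of this patch or its tests)
showing the kind of expression the eagerly-assume mode bifurcates on,
in the spirit of the `x = (y != 0)` example from the old option
description:

```cpp
// Illustrative only: the function and variable names are made up.
// With ShouldEagerlyAssume enabled, the analyzer does not keep the
// symbolic value of '(y != 0)'; it splits the path at the comparison,
// binding 'b' to 1 where 'y != 0' is assumed true and to 0 where it is
// assumed false. The same splitting applies to a logical negation like
// '!y', which is why the "BinOp" part of the old names was misleading.
int classify(int y) {
  int b = (y != 0); // path bifurcates: {y != 0, b == 1} vs. {y == 0, b == 0}
  if (b)
    return 1; // reachable only where the comparison was assumed true
  return 0;   // reachable only where it was assumed false
}
```
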
---
 clang-tools-extra/clang-tidy/ClangTidy.cpp    |  1 -
 .../StaticAnalyzer/Core/AnalyzerOptions.def   | 13 +++---
 .../StaticAnalyzer/Core/AnalyzerOptions.h     |  8 ++--
 .../Core/PathSensitive/ExprEngine.h           | 11 +++--
 .../Core/BugReporterVisitors.cpp              |  2 +-
 clang/lib/StaticAnalyzer/Core/ExprEngine.cpp  | 41 ++++++++-----------
 6 files changed, 33 insertions(+), 43 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/ClangTidy.cpp b/clang-tools-extra/clang-tidy/ClangTidy.cpp
index 62f9d19b2a362f..c4cac7d27b77c2 100644
--- a/clang-tools-extra/clang-tidy/ClangTidy.cpp
+++ b/clang-tools-extra/clang-tidy/ClangTidy.cpp
@@ -458,7 +458,6 @@ ClangTidyASTConsumerFactory::createASTConsumer(
   if (!AnalyzerOptions.CheckersAndPackages.empty()) {
     setStaticAnalyzerCheckerOpts(Context.getOptions(), AnalyzerOptions);
     AnalyzerOptions.AnalysisDiagOpt = PD_NONE;
-    AnalyzerOptions.eagerlyAssumeBinOpBifurcation = true;
     std::unique_ptr<ento::AnalysisASTConsumer> AnalysisConsumer =
         ento::CreateAnalysisConsumer(Compiler);
     AnalysisConsumer->AddDiagnosticConsumer(
diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def
index 737bc8e86cfb6a..ad2dbffe88326d 100644
--- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def
+++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def
@@ -299,13 +299,12 @@ ANALYZER_OPTION(
 
 ANALYZER_OPTION(
     bool, ShouldEagerlyAssume, "eagerly-assume",
-    "Whether we should eagerly assume evaluations of conditionals, thus, "
-    "bifurcating the path. This indicates how the engine should handle "
-    "expressions such as: 'x = (y != 0)'. When this is true then the "
-    "subexpression 'y != 0' will be eagerly assumed to be true or false, thus "
-    "evaluating it to the integers 0 or 1 respectively. The upside is that "
-    "this can increase analysis precision until we have a better way to lazily "
-    "evaluate such logic. The downside is that it eagerly bifurcates paths.",
+    "If this is enabled (the default behavior), when the analyzer encounters "
+    "a comparison operator or logical negation, it immediately splits the "
+    "state to separate the case when the expression is true and the case when "
+    "it's false. The upside is that this can increase analysis precision until "
+    "we have a better way to lazily evaluate such logic; the downside is that "
+    "it eagerly bifurcates paths.",
     true)
 
 ANALYZER_OPTION(
diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
index 3a3c1a13d67dd5..2f4cd277cccdc6 100644
--- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
+++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
@@ -229,8 +229,6 @@ class AnalyzerOptions : public RefCountedBase<AnalyzerOptions> {
   unsigned AnalyzerDisplayProgress : 1;
   unsigned AnalyzerNoteAnalysisEntryPoints : 1;
 
-  unsigned eagerlyAssumeBinOpBifurcation : 1;
-
   unsigned TrimGraph : 1;
   unsigned visualizeExplodedGraphWithGraphViz : 1;
   unsigned UnoptimizedCFG : 1;
@@ -293,9 +291,9 @@ class AnalyzerOptions : public RefCountedBase<AnalyzerOptions> {
         ShowConfigOptionsList(false),
         ShouldEmitErrorsOnInvalidConfigValue(false), AnalyzeAll(false),
         AnalyzerDisplayProgress(false), AnalyzerNoteAnalysisEntryPoints(false),
-        eagerlyAssumeBinOpBifurcation(false), TrimGraph(false),
-        visualizeExplodedGraphWithGraphViz(false), UnoptimizedCFG(false),
-        PrintStats(false), NoRetryExhausted(false), AnalyzerWerror(false) {}
+        TrimGraph(false), visualizeExplodedGraphWithGraphViz(false),
+        UnoptimizedCFG(false), PrintStats(false), NoRetryExhausted(false),
+        AnalyzerWerror(false) {}
 
   /// Interprets an option's string value as a boolean. The "true" string is
   /// interpreted as true and the "false" string is interpreted as false.
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
index 04eacd1df7ffe2..8c7493e27fcaa6 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
@@ -583,14 +583,13 @@ class ExprEngine {
                                 ExplodedNode *Pred,
                                 ExplodedNodeSet &Dst);
 
-  /// evalEagerlyAssumeBinOpBifurcation - Given the nodes in 'Src', eagerly assume symbolic
-  ///  expressions of the form 'x != 0' and generate new nodes (stored in Dst)
-  ///  with those assumptions.
-  void evalEagerlyAssumeBinOpBifurcation(ExplodedNodeSet &Dst, ExplodedNodeSet &Src,
-                         const Expr *Ex);
+  /// evalEagerlyAssumeBifurcation - Given the nodes in 'Src', eagerly assume
+  /// concrete boolean values for 'Ex', storing the resulting nodes in 'Dst'.
+  void evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, ExplodedNodeSet &Src,
+                                    const Expr *Ex);
 
   static std::pair<const ProgramPointTag *, const ProgramPointTag *>
-    geteagerlyAssumeBinOpBifurcationTags();
+  getEagerlyAssumeBifurcationTags();
 
   ProgramStateRef handleLValueBitCast(ProgramStateRef state, const Expr *Ex,
                                       const LocationContext *LCtx, QualType T,
diff --git a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
index 68c8a8dc682507..c4479db14b791d 100644
--- a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
+++ b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
@@ -2695,7 +2695,7 @@ ConditionBRVisitor::VisitNodeImpl(const ExplodedNode *N,
                                   PathSensitiveBugReport &BR) {
   ProgramPoint ProgPoint = N->getLocation();
   const std::pair<const ProgramPointTag *, const ProgramPointTag *> &Tags =
-      ExprEngine::geteagerlyAssumeBinOpBifurcationTags();
+      ExprEngine::getEagerlyAssumeBifurcationTags();
 
   // If an assumption was made on a branch, it should be caught
   // here by looking at the state transition.
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 43ab646d398b31..0e400dfadb8cf8 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -2129,7 +2129,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
           (B->isRelationalOp() || B->isEqualityOp())) {
         ExplodedNodeSet Tmp;
         VisitBinaryOperator(cast<BinaryOperator>(S), Pred, Tmp);
-        evalEagerlyAssumeBinOpBifurcation(Dst, Tmp, cast<Expr>(S));
+        evalEagerlyAssumeBifurcation(Dst, Tmp, cast<Expr>(S));
       }
       else
         VisitBinaryOperator(cast<BinaryOperator>(S), Pred, Dst);
@@ -2402,7 +2402,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
       if (AMgr.options.ShouldEagerlyAssume && (U->getOpcode() == UO_LNot)) {
         ExplodedNodeSet Tmp;
         VisitUnaryOperator(U, Pred, Tmp);
-        evalEagerlyAssumeBinOpBifurcation(Dst, Tmp, U);
+        evalEagerlyAssumeBifurcation(Dst, Tmp, U);
       }
       else
         VisitUnaryOperator(U, Pred, Dst);
@@ -3742,23 +3742,20 @@ void ExprEngine::evalLocation(ExplodedNodeSet &Dst,
   BldrTop.addNodes(Tmp);
 }
 
-std::pair<const ProgramPointTag *, const ProgramPointTag*>
-ExprEngine::geteagerlyAssumeBinOpBifurcationTags() {
-  static SimpleProgramPointTag
-         eagerlyAssumeBinOpBifurcationTrue(TagProviderName,
-                                           "Eagerly Assume True"),
-         eagerlyAssumeBinOpBifurcationFalse(TagProviderName,
-                                            "Eagerly Assume False");
-  return std::make_pair(&eagerlyAssumeBinOpBifurcationTrue,
-                        &eagerlyAssumeBinOpBifurcationFalse);
+std::pair<const ProgramPointTag *, const ProgramPointTag *>
+ExprEngine::getEagerlyAssumeBifurcationTags() {
+  static SimpleProgramPointTag TrueTag(TagProviderName, "Eagerly Assume True"),
+      FalseTag(TagProviderName, "Eagerly Assume False");
+
+  return std::make_pair(&TrueTag, &FalseTag);
 }
 
-void ExprEngine::evalEagerlyAssumeBinOpBifurcation(ExplodedNodeSet &Dst,
-                                                   ExplodedNodeSet &Src,
-                                                   const Expr *Ex) {
+void ExprEngine::evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst,
+                                              ExplodedNodeSet &Src,
+                                              const Expr *Ex) {
   StmtNodeBuilder Bldr(Src, Dst, *currBldrCtx);
 
-  for (const auto Pred : Src) {
+  for (ExplodedNode *Pred : Src) {
     // Test if the previous node was as the same expression.  This can happen
     // when the expression fails to evaluate to anything meaningful and
     // (as an optimization) we don't generate a node.
@@ -3767,28 +3764,26 @@ void ExprEngine::evalEagerlyAssumeBinOpBifurcation(ExplodedNodeSet &Dst,
       continue;
     }
 
-    ProgramStateRef state = Pred->getState();
-    SVal V = state->getSVal(Ex, Pred->getLocationContext());
+    ProgramStateRef State = Pred->getState();
+    SVal V = State->getSVal(Ex, Pred->getLocationContext());
     std::optional<nonloc::SymbolVal> SEV = V.getAs<nonloc::SymbolVal>();
     if (SEV && SEV->isExpression()) {
-      const std::pair<const ProgramPointTag *, const ProgramPointTag*> &tags =
-        geteagerlyAssumeBinOpBifurcationTags();
+      const auto &[TrueTag, FalseTag] = getEagerlyAssumeBifurcationTags();
 
-      ProgramStateRef StateTrue, StateFalse;
-      std::tie(StateTrue, StateFalse) = state->assume(*SEV);
+      auto [StateTrue, StateFalse] = State->assume(*SEV);
 
       // First assume that the condition is true.
       if (StateTrue) {
         SVal Val = svalBuilder.makeIntVal(1U, Ex->getType());
         StateTrue = StateTrue->BindExpr(Ex, Pred->getLocationContext(), Val);
-        Bldr.generateNode(Ex, Pred, StateTrue, tags.first);
+        Bldr.generateNode(Ex, Pred, StateTrue, TrueTag);
       }
 
       // Next, assume that the condition is false.
       if (StateFalse) {
         SVal Val = svalBuilder.makeIntVal(0U, Ex->getType());
         StateFalse = StateFalse->BindExpr(Ex, Pred->getLocationContext(), Val);
-        Bldr.generateNode(Ex, Pred, StateFalse, tags.second);
+        Bldr.generateNode(Ex, Pred, StateFalse, FalseTag);
       }
     }
   }

>From f113a66c29b17e4937ff5d0ab67dc087fa6ee27e Mon Sep 17 00:00:00 2001
From: Karl-Johan Karlsson <karl-johan.karlsson at ericsson.com>
Date: Wed, 16 Oct 2024 13:49:34 +0200
Subject: [PATCH 47/82] [ARM] Fix warnings in ARMAsmParser.cpp and
 ARMDisassembler.cpp (#112507)

Fix gcc warnings like:
ARMAsmParser.cpp:7168:46: warning: enumeral and non-enumeral type in
conditional expression [-Wextra]
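
As a self-contained illustration of the diagnostic (a sketch with a
made-up `Reg` enum and placeholder values, not the real ARM register
enum): GCC's -Wextra fires when one arm of `?:` is a plain integer
literal and the other is an enumerator, and spelling the zero as the
corresponding "no register" enumerator keeps both arms the same type,
which is the shape of the change below.

```cpp
// Illustrative only: 'Reg' and its values are placeholders; the real code
// uses llvm::ARM::NoRegister and llvm::ARM::CPSR.
enum Reg { NoRegister = 0, CPSR = 1 };

unsigned ccOutReg(bool InITBlock) {
  // With -Wextra, GCC warns on the next (commented-out) line because one arm
  // is 'int' and the other is 'Reg':
  //   unsigned Warns = InITBlock ? 0 : CPSR;

  // Using the enumerator for the zero case gives both arms the enum type,
  // so the warning disappears.
  unsigned Quiet = InITBlock ? NoRegister : CPSR;
  return Quiet;
}
```
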
---
 .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 31 +++++++++++--------
 .../ARM/Disassembler/ARMDisassembler.cpp      |  5 +--
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 75fb90477f8854..b908e4f367e1a3 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -2532,14 +2532,14 @@ class ARMOperand : public MCParsedAsmOperand {
   void addCondCodeOperands(MCInst &Inst, unsigned N) const {
     assert(N == 2 && "Invalid number of operands!");
     Inst.addOperand(MCOperand::createImm(unsigned(getCondCode())));
-    unsigned RegNum = getCondCode() == ARMCC::AL ? 0: ARM::CPSR;
+    unsigned RegNum = getCondCode() == ARMCC::AL ? ARM::NoRegister : ARM::CPSR;
     Inst.addOperand(MCOperand::createReg(RegNum));
   }
 
   void addVPTPredNOperands(MCInst &Inst, unsigned N) const {
     assert(N == 3 && "Invalid number of operands!");
     Inst.addOperand(MCOperand::createImm(unsigned(getVPTPred())));
-    unsigned RegNum = getVPTPred() == ARMVCC::None ? 0: ARM::P0;
+    unsigned RegNum = getVPTPred() == ARMVCC::None ? ARM::NoRegister : ARM::P0;
     Inst.addOperand(MCOperand::createReg(RegNum));
     Inst.addOperand(MCOperand::createReg(0));
   }
@@ -7164,8 +7164,8 @@ bool ARMAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name,
   // Add the carry setting operand, if necessary.
   if (CanAcceptCarrySet && CarrySetting) {
     SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size());
-    Operands.push_back(
-        ARMOperand::CreateCCOut(CarrySetting ? ARM::CPSR : 0, Loc, *this));
+    Operands.push_back(ARMOperand::CreateCCOut(
+        CarrySetting ? ARM::CPSR : ARM::NoRegister, Loc, *this));
   }
 
   // Add the predication code operand, if necessary.
@@ -10372,7 +10372,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
   case ARM::t2ASRri:
     if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
         isARMLowRegister(Inst.getOperand(1).getReg()) &&
-        Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+        Inst.getOperand(5).getReg() ==
+            (inITBlock() ? ARM::NoRegister : ARM::CPSR) &&
         !HasWideQualifier) {
       unsigned NewOpc;
       switch (Inst.getOpcode()) {
@@ -10422,14 +10423,14 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     TmpInst.addOperand(Inst.getOperand(0)); // Rd
     if (isNarrow)
       TmpInst.addOperand(MCOperand::createReg(
-          Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0));
+          Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : ARM::NoRegister));
     TmpInst.addOperand(Inst.getOperand(1)); // Rn
     TmpInst.addOperand(Inst.getOperand(2)); // Rm
     TmpInst.addOperand(Inst.getOperand(4)); // CondCode
     TmpInst.addOperand(Inst.getOperand(5));
     if (!isNarrow)
       TmpInst.addOperand(MCOperand::createReg(
-          Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0));
+          Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : ARM::NoRegister));
     Inst = TmpInst;
     return true;
   }
@@ -10475,7 +10476,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     TmpInst.addOperand(Inst.getOperand(0)); // Rd
     if (isNarrow && !isMov)
       TmpInst.addOperand(MCOperand::createReg(
-          Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0));
+          Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : ARM::NoRegister));
     TmpInst.addOperand(Inst.getOperand(1)); // Rn
     if (newOpc != ARM::t2RRX && !isMov)
       TmpInst.addOperand(MCOperand::createImm(Amount));
@@ -10483,7 +10484,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     TmpInst.addOperand(Inst.getOperand(4));
     if (!isNarrow)
       TmpInst.addOperand(MCOperand::createReg(
-          Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0));
+          Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : ARM::NoRegister));
     Inst = TmpInst;
     return true;
   }
@@ -10684,7 +10685,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
         !isARMLowRegister(Inst.getOperand(0).getReg()) ||
         (Inst.getOperand(2).isImm() &&
          (unsigned)Inst.getOperand(2).getImm() > 255) ||
-        Inst.getOperand(5).getReg() != (inITBlock() ? 0 : ARM::CPSR) ||
+        Inst.getOperand(5).getReg() !=
+            (inITBlock() ? ARM::NoRegister : ARM::CPSR) ||
         HasWideQualifier)
       break;
     MCInst TmpInst;
@@ -10852,7 +10854,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
         (Inst.getOperand(1).isImm() &&
          (unsigned)Inst.getOperand(1).getImm() <= 255) &&
-        Inst.getOperand(4).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+        Inst.getOperand(4).getReg() ==
+            (inITBlock() ? ARM::NoRegister : ARM::CPSR) &&
         !HasWideQualifier) {
       // The operands aren't in the same order for tMOVi8...
       MCInst TmpInst;
@@ -10993,7 +10996,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     if ((isARMLowRegister(Inst.getOperand(1).getReg()) &&
          isARMLowRegister(Inst.getOperand(2).getReg())) &&
         Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
-        Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+        Inst.getOperand(5).getReg() ==
+            (inITBlock() ? ARM::NoRegister : ARM::CPSR) &&
         !HasWideQualifier) {
       unsigned NewOpc;
       switch (Inst.getOpcode()) {
@@ -11029,7 +11033,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
          isARMLowRegister(Inst.getOperand(2).getReg())) &&
         (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() ||
          Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) &&
-        Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+        Inst.getOperand(5).getReg() ==
+            (inITBlock() ? ARM::NoRegister : ARM::CPSR) &&
         !HasWideQualifier) {
       unsigned NewOpc;
       switch (Inst.getOpcode()) {
diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 93b74905fc59fc..fa5dd10cfdaa06 100644
--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -894,12 +894,13 @@ void ARMDisassembler::AddThumb1SBit(MCInst &MI, bool InITBlock) const {
         MCID.operands()[i].RegClass == ARM::CCRRegClassID) {
       if (i > 0 && MCID.operands()[i - 1].isPredicate())
         continue;
-      MI.insert(I, MCOperand::createReg(InITBlock ? 0 : ARM::CPSR));
+      MI.insert(I,
+                MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR));
       return;
     }
   }
 
-  MI.insert(I, MCOperand::createReg(InITBlock ? 0 : ARM::CPSR));
+  MI.insert(I, MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR));
 }
 
 bool ARMDisassembler::isVectorPredicable(const MCInst &MI) const {

>From caa7301bc8081bfaf8fc9f3644d558d336038c43 Mon Sep 17 00:00:00 2001
From: Sven van Haastregt <sven.vanhaastregt at arm.com>
Date: Wed, 16 Oct 2024 13:58:12 +0200
Subject: [PATCH 48/82] [OpenCL] Restore addrspacecast for pipe builtins
 (#112514)

Commit 84ee629bc515 ("clang: Remove some pointer bitcasts (#112324)",
2024-10-15) triggered some "Call parameter type does not match function
signature!" errors when using the OpenCL pipe builtin functions under
the spir triple, due to a missing addrspacecast.

This would have been caught by the pipe_builtin.cl test if that had used
the `spir-unknown-unknown` triple, so extend the test to use that
triple too.
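
For context, a minimal sketch of the shape of the fix (the helper and
its parameter names are invented for illustration; the actual one-line
change in `CGBuiltin.cpp` is in the diff below): the pointer argument is
addrspacecast to the pointer type the pipe runtime declaration expects
before the call is emitted, so the call's parameter types match the
callee's signature.

```cpp
#include "llvm/IR/IRBuilder.h"

// Illustrative helper, not the actual CodeGen code: cast the packet pointer
// to the type the runtime function's signature expects. IRBuilder returns the
// value unchanged if it already has that type; otherwise it emits an
// addrspacecast instruction.
static llvm::Value *castToExpectedPtrTy(llvm::IRBuilder<> &Builder,
                                        llvm::Value *Arg,
                                        llvm::Type *ExpectedPtrTy) {
  return Builder.CreateAddrSpaceCast(Arg, ExpectedPtrTy);
}
```
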
---
 clang/lib/CodeGen/CGBuiltin.cpp          |  3 ++-
 clang/test/CodeGenOpenCL/pipe_builtin.cl | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 12f99d9f1178a9..f6d7db2c204c12 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5657,13 +5657,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
             *Arg3 = EmitScalarExpr(E->getArg(3));
       llvm::FunctionType *FTy = llvm::FunctionType::get(
           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
+      Value *ACast = Builder.CreateAddrSpaceCast(Arg3, I8PTy);
       // We know the third argument is an integer type, but we may need to cast
       // it to i32.
       if (Arg2->getType() != Int32Ty)
         Arg2 = Builder.CreateZExtOrTrunc(Arg2, Int32Ty);
       return RValue::get(
           EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
-                          {Arg0, Arg1, Arg2, Arg3, PacketSize, PacketAlign}));
+                          {Arg0, Arg1, Arg2, ACast, PacketSize, PacketAlign}));
     }
   }
   // OpenCL v2.0 s6.13.16 ,s9.17.3.5 - Built-in pipe reserve read and write
diff --git a/clang/test/CodeGenOpenCL/pipe_builtin.cl b/clang/test/CodeGenOpenCL/pipe_builtin.cl
index c59f63bab6a458..ec9d7cb0450641 100644
--- a/clang/test/CodeGenOpenCL/pipe_builtin.cl
+++ b/clang/test/CodeGenOpenCL/pipe_builtin.cl
@@ -1,3 +1,4 @@
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -cl-ext=+cl_khr_subgroups -O0 -cl-std=clc++ -o - %s | FileCheck --check-prefix=CHECK-SPIR %s
 // RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -cl-ext=+cl_khr_subgroups -O0 -cl-std=clc++ -o - %s | FileCheck %s
 // FIXME: Add MS ABI manglings of OpenCL things and remove %itanium_abi_triple
 // above to support OpenCL in the MS C++ ABI.
@@ -5,65 +6,85 @@
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
 
 void test1(read_only pipe int p, global int *ptr) {
+  // CHECK-SPIR: call spir_func i32 @__read_pipe_2(target("spirv.Pipe", 0) %{{.*}}, ptr addrspace(4) %{{.*}}, i32 4, i32 4)
   // CHECK: call i32 @__read_pipe_2(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4)
   read_pipe(p, ptr);
+  // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__reserve_read_pipe(target("spirv.Pipe", 0) %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   // CHECK: call ptr @__reserve_read_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   reserve_id_t rid = reserve_read_pipe(p, 2);
+  // CHECK-SPIR: call spir_func i32 @__read_pipe_4(target("spirv.Pipe", 0) %{{.*}}, ptr addrspace(4) %{{.*}}, i32 4, i32 4)
   // CHECK: call i32 @__read_pipe_4(ptr %{{.*}}, ptr %{{.*}}, i32 {{.*}}, ptr %{{.*}}, i32 4, i32 4)
   read_pipe(p, rid, 2, ptr);
+  // CHECK-SPIR: call spir_func void @__commit_read_pipe(target("spirv.Pipe", 0) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4)
   // CHECK: call void @__commit_read_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4)
   commit_read_pipe(p, rid);
 }
 
 void test2(write_only pipe int p, global int *ptr) {
+  // CHECK-SPIR: call spir_func i32 @__write_pipe_2(target("spirv.Pipe", 1) %{{.*}}, ptr addrspace(4) %{{.*}}, i32 4, i32 4)
   // CHECK: call i32 @__write_pipe_2(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4)
   write_pipe(p, ptr);
+  // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__reserve_write_pipe(target("spirv.Pipe", 1) %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   // CHECK: call ptr @__reserve_write_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   reserve_id_t rid = reserve_write_pipe(p, 2);
+  // CHECK-SPIR: call spir_func i32 @__write_pipe_4(target("spirv.Pipe", 1) %{{.*}}, ptr addrspace(4) %{{.*}}, i32 4, i32 4)
   // CHECK: call i32 @__write_pipe_4(ptr %{{.*}}, ptr %{{.*}}, i32 {{.*}}, ptr %{{.*}}, i32 4, i32 4)
   write_pipe(p, rid, 2, ptr);
+  // CHECK-SPIR: call spir_func void @__commit_write_pipe(target("spirv.Pipe", 1) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4)
   // CHECK: call void @__commit_write_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4)
   commit_write_pipe(p, rid);
 }
 
 void test3(read_only pipe int p, global int *ptr) {
+  // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__work_group_reserve_read_pipe(target("spirv.Pipe", 0) %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   // CHECK: call ptr @__work_group_reserve_read_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   reserve_id_t rid = work_group_reserve_read_pipe(p, 2);
+  // CHECK-SPIR: call spir_func void @__work_group_commit_read_pipe(target("spirv.Pipe", 0) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4)
   // CHECK: call void @__work_group_commit_read_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4)
   work_group_commit_read_pipe(p, rid);
 }
 
 void test4(write_only pipe int p, global int *ptr) {
+  // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__work_group_reserve_write_pipe(target("spirv.Pipe", 1) %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   // CHECK: call ptr @__work_group_reserve_write_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   reserve_id_t rid = work_group_reserve_write_pipe(p, 2);
+  // CHECK-SPIR: call spir_func void @__work_group_commit_write_pipe(target("spirv.Pipe", 1) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4)
   // CHECK: call void @__work_group_commit_write_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4)
   work_group_commit_write_pipe(p, rid);
 }
 
 void test5(read_only pipe int p, global int *ptr) {
+  // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__sub_group_reserve_read_pipe(target("spirv.Pipe", 0) %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   // CHECK: call ptr @__sub_group_reserve_read_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   reserve_id_t rid = sub_group_reserve_read_pipe(p, 2);
+  // CHECK-SPIR: call spir_func void @__sub_group_commit_read_pipe(target("spirv.Pipe", 0) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4)
   // CHECK: call void @__sub_group_commit_read_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4)
   sub_group_commit_read_pipe(p, rid);
 }
 
 void test6(write_only pipe int p, global int *ptr) {
+  // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__sub_group_reserve_write_pipe(target("spirv.Pipe", 1) %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   // CHECK: call ptr @__sub_group_reserve_write_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   reserve_id_t rid = sub_group_reserve_write_pipe(p, 2);
+  // CHECK-SPIR: call spir_func void @__sub_group_commit_write_pipe(target("spirv.Pipe", 1) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4)
   // CHECK: call void @__sub_group_commit_write_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4)
   sub_group_commit_write_pipe(p, rid);
 }
 
 void test7(read_only pipe int p, global int *ptr) {
+  // CHECK-SPIR: call spir_func i32 @__get_pipe_num_packets_ro(target("spirv.Pipe", 0) %{{.*}}, i32 4, i32 4)
   // CHECK: call i32 @__get_pipe_num_packets_ro(ptr %{{.*}}, i32 4, i32 4)
   *ptr = get_pipe_num_packets(p);
+  // CHECK-SPIR: call spir_func i32 @__get_pipe_max_packets_ro(target("spirv.Pipe", 0) %{{.*}}, i32 4, i32 4)
   // CHECK: call i32 @__get_pipe_max_packets_ro(ptr %{{.*}}, i32 4, i32 4)
   *ptr = get_pipe_max_packets(p);
 }
 
 void test8(write_only pipe int p, global int *ptr) {
+  // CHECK-SPIR: call spir_func i32 @__get_pipe_num_packets_wo(target("spirv.Pipe", 1) %{{.*}}, i32 4, i32 4)
   // CHECK: call i32 @__get_pipe_num_packets_wo(ptr %{{.*}}, i32 4, i32 4)
   *ptr = get_pipe_num_packets(p);
+  // CHECK-SPIR: call spir_func i32 @__get_pipe_max_packets_wo(target("spirv.Pipe", 1) %{{.*}}, i32 4, i32 4)
   // CHECK: call i32 @__get_pipe_max_packets_wo(ptr %{{.*}}, i32 4, i32 4)
   *ptr = get_pipe_max_packets(p);
 }

>From f5f00764abeb7023719d64774e263936b3f31ab7 Mon Sep 17 00:00:00 2001
From: Lewis Crawford <lcrawford at nvidia.com>
Date: Wed, 16 Oct 2024 13:23:46 +0100
Subject: [PATCH 49/82] [DAGCombiner] Fix check for extending loads (#112182)

Fix a check for extending loads in DAGCombiner:
if the result type has more bits than the loaded
type, the load should count as an extending load.

All backends apart from AArch64 ignore this
ExtTy argument to shouldReduceLoadWidth, so this
change currently only impacts AArch64.
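
As a hedged, stand-alone restatement of the corrected logic (the names
below are invented; the real code uses `ISD::LoadExtType` and
`EVT::bitsGT`, as the diff shows): a load is an extending load exactly
when the value it must produce is wider than the type it reads from
memory.

```cpp
// Illustrative only: mirrors the intent of the fixed ternary in
// DAGCombiner::scalarizeExtractedVectorLoad.
enum LoadExtKind { NonExtendingLoad, ExtendingLoad };

LoadExtKind classifyLoad(unsigned ResultBits, unsigned LoadedBits) {
  // Reading fewer bits from memory than the result type holds means the
  // loaded value has to be extended.
  return ResultBits > LoadedBits ? ExtendingLoad : NonExtendingLoad;
}
```
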
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  2 +-
 .../AArch64/aarch64-scalarize-vec-load-ext.ll | 35 +++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0879165aac139d..ca91d35573c3ec 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22568,7 +22568,7 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
     return SDValue();
 
   ISD::LoadExtType ExtTy =
-      ResultVT.bitsGT(VecEltVT) ? ISD::NON_EXTLOAD : ISD::EXTLOAD;
+      ResultVT.bitsGT(VecEltVT) ? ISD::EXTLOAD : ISD::NON_EXTLOAD;
   if (!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT) ||
       !TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
     return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll
new file mode 100644
index 00000000000000..30ce0cb09fc084
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+; FIXME: Currently, we avoid narrowing this v4i32 load, in the
+; hopes of being able to fold the shift, despite it requiring stack
+; storage + loads. Ideally, we should narrow here and load the i32
+; directly from the variable offset e.g:
+;
+; add     x8, x0, x1, lsl #4
+; and     x9, x2, #0x3
+; ldr     w0, [x8, x9, lsl #2]
+;
+; The AArch64TargetLowering::shouldReduceLoadWidth heuristic should
+; probably be updated to choose load-narrowing instead of folding the
+; lsl in larger vector cases.
+;
+define i32 @narrow_load_v4_i32_single_ele_variable_idx(ptr %ptr, i64 %off, i32 %ele) {
+; CHECK-LABEL: narrow_load_v4_i32_single_ele_variable_idx:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr q0, [x0, x1, lsl #4]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
+; CHECK-NEXT:    bfi x8, x2, #2, #2
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    ldr w0, [x8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+entry:
+  %idx = getelementptr inbounds <4 x i32>, ptr %ptr, i64 %off
+  %x = load <4 x i32>, ptr %idx, align 8
+  %res = extractelement <4 x i32> %x, i32 %ele
+  ret i32 %res
+}

>From 3e31e30a844fe388e71b1dfafe836c31dd0dd4a4 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 16 Oct 2024 13:26:54 +0100
Subject: [PATCH 50/82] [X86] Add some basic test coverage for #112425

---
 llvm/test/CodeGen/X86/andnot-patterns.ll | 652 +++++++++++++++++++++++
 1 file changed, 652 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/andnot-patterns.ll

diff --git a/llvm/test/CodeGen/X86/andnot-patterns.ll b/llvm/test/CodeGen/X86/andnot-patterns.ll
new file mode 100644
index 00000000000000..0ff4e3b47ae46a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/andnot-patterns.ll
@@ -0,0 +1,652 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-- -mattr=+bmi | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+bmi | FileCheck %s --check-prefixes=X64
+
+; TODO - PR112425 - attempt to reconstruct andnot patterns through bitwise-agnostic operations
+
+declare void @use_i64(i64)
+
+;
+; Fold (and X, (rotl (not Y), Z))) -> (and X, (not (rotl Y, Z)))
+;
+
+define i64 @andnot_rotl_i64(i64 %a0, i64 %a1) nounwind {
+; X86-LABEL: andnot_rotl_i64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    notl %ebx
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    notl %edx
+; X86-NEXT:    testb $32, %cl
+; X86-NEXT:    jne .LBB0_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    jmp .LBB0_3
+; X86-NEXT:  .LBB0_1:
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:  .LBB0_3:
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    shldl %cl, %eax, %edx
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shldl %cl, %ebx, %eax
+; X86-NEXT:    andl %edi, %eax
+; X86-NEXT:    andl %esi, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
+;
+; X64-LABEL: andnot_rotl_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rsi, %rcx
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    notq %rax
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NEXT:    rolq %cl, %rax
+; X64-NEXT:    andq %rdi, %rax
+; X64-NEXT:    retq
+  %not = xor i64 %a0, -1
+  %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a1)
+  %and = and i64 %rot, %a0
+  ret i64 %and
+}
+
+define i32 @andnot_rotl_i32(i32 %a0, i32 %a1) nounwind {
+; X86-LABEL: andnot_rotl_i32:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    notl %eax
+; X86-NEXT:    roll %cl, %eax
+; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: andnot_rotl_i32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    notl %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    roll %cl, %eax
+; X64-NEXT:    andl %edi, %eax
+; X64-NEXT:    retq
+  %not = xor i32 %a0, -1
+  %rot = tail call i32 @llvm.fshl.i32(i32 %not, i32 %not, i32 %a1)
+  %and = and i32 %rot, %a0
+  ret i32 %and
+}
+
+define i16 @andnot_rotl_i16(i16 %a0, i16 %a1) nounwind {
+; X86-LABEL: andnot_rotl_i16:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    notl %eax
+; X86-NEXT:    rolw %cl, %ax
+; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: andnot_rotl_i16:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    notl %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    rolw %cl, %ax
+; X64-NEXT:    andl %edi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %not = xor i16 %a0, -1
+  %rot = tail call i16 @llvm.fshl.i16(i16 %not, i16 %not, i16 %a1)
+  %and = and i16 %rot, %a0
+  ret i16 %and
+}
+
+define i8 @andnot_rotl_i8(i8 %a0, i8 %a1) nounwind {
+; X86-LABEL: andnot_rotl_i8:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    notb %al
+; X86-NEXT:    rolb %cl, %al
+; X86-NEXT:    andb %dl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: andnot_rotl_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    notb %al
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    rolb %cl, %al
+; X64-NEXT:    andb %dil, %al
+; X64-NEXT:    retq
+  %not = xor i8 %a0, -1
+  %rot = tail call i8 @llvm.fshl.i8(i8 %not, i8 %not, i8 %a1)
+  %and = and i8 %rot, %a0
+  ret i8 %and
+}
+
+define i64 @andnot_rotl_i64_multiuse(i64 %a0, i64 %a1) nounwind {
+; X86-LABEL: andnot_rotl_i64_multiuse:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    notl %eax
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    notl %ebx
+; X86-NEXT:    testb $32, %cl
+; X86-NEXT:    jne .LBB4_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    jmp .LBB4_3
+; X86-NEXT:  .LBB4_1:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:  .LBB4_3:
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    shldl %cl, %edx, %ebx
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shldl %cl, %eax, %edx
+; X86-NEXT:    andl %edx, %esi
+; X86-NEXT:    andl %ebx, %edi
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    calll use_i64 at PLT
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
+;
+; X64-LABEL: andnot_rotl_i64_multiuse:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    movq %rsi, %rcx
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    notq %rdi
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NEXT:    rolq %cl, %rdi
+; X64-NEXT:    andq %rdi, %rbx
+; X64-NEXT:    callq use_i64 at PLT
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    retq
+  %not = xor i64 %a0, -1
+  %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a1)
+  %and = and i64 %rot, %a0
+  call void @use_i64(i64 %rot)
+  ret i64 %and
+}
+
+;
+; Fold (and X, (rotr (not Y), Z))) -> (and X, (not (rotr Y, Z)))
+;
+
+define i64 @andnot_rotr_i64(i64 %a0, i64 %a1) nounwind {
+; X86-LABEL: andnot_rotr_i64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    notl %ebx
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    notl %edx
+; X86-NEXT:    testb $32, %cl
+; X86-NEXT:    jne .LBB5_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    jmp .LBB5_3
+; X86-NEXT:  .LBB5_1:
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:  .LBB5_3:
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    shldl %cl, %eax, %edx
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shldl %cl, %ebx, %eax
+; X86-NEXT:    andl %edi, %eax
+; X86-NEXT:    andl %esi, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
+;
+; X64-LABEL: andnot_rotr_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rsi, %rcx
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    notq %rax
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NEXT:    rolq %cl, %rax
+; X64-NEXT:    andq %rdi, %rax
+; X64-NEXT:    retq
+  %not = xor i64 %a0, -1
+  %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a1)
+  %and = and i64 %rot, %a0
+  ret i64 %and
+}
+
+define i32 @andnot_rotr_i32(i32 %a0, i32 %a1) nounwind {
+; X86-LABEL: andnot_rotr_i32:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    notl %eax
+; X86-NEXT:    roll %cl, %eax
+; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: andnot_rotr_i32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    notl %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    roll %cl, %eax
+; X64-NEXT:    andl %edi, %eax
+; X64-NEXT:    retq
+  %not = xor i32 %a0, -1
+  %rot = tail call i32 @llvm.fshl.i32(i32 %not, i32 %not, i32 %a1)
+  %and = and i32 %rot, %a0
+  ret i32 %and
+}
+
+define i16 @andnot_rotr_i16(i16 %a0, i16 %a1) nounwind {
+; X86-LABEL: andnot_rotr_i16:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    notl %eax
+; X86-NEXT:    rolw %cl, %ax
+; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: andnot_rotr_i16:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    notl %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    rolw %cl, %ax
+; X64-NEXT:    andl %edi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %not = xor i16 %a0, -1
+  %rot = tail call i16 @llvm.fshl.i16(i16 %not, i16 %not, i16 %a1)
+  %and = and i16 %rot, %a0
+  ret i16 %and
+}
+
+define i8 @andnot_rotr_i8(i8 %a0, i8 %a1) nounwind {
+; X86-LABEL: andnot_rotr_i8:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    notb %al
+; X86-NEXT:    rolb %cl, %al
+; X86-NEXT:    andb %dl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: andnot_rotr_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    notb %al
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    rolb %cl, %al
+; X64-NEXT:    andb %dil, %al
+; X64-NEXT:    retq
+  %not = xor i8 %a0, -1
+  %rot = tail call i8 @llvm.fshl.i8(i8 %not, i8 %not, i8 %a1)
+  %and = and i8 %rot, %a0
+  ret i8 %and
+}
+
+;
+; Fold (and X, (bswap (not Y)))) -> (and X, (not (bswap Y)))
+;
+
+define i64 @andnot_bswap_i64(i64 %a0) nounwind {
+; X86-LABEL: andnot_bswap_i64:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    notl %edx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    andl %eax, %edx
+; X86-NEXT:    notl %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: andnot_bswap_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    notq %rax
+; X64-NEXT:    bswapq %rax
+; X64-NEXT:    andq %rdi, %rax
+; X64-NEXT:    retq
+  %not = xor i64 %a0, -1
+  %bswap = tail call i64 @llvm.bswap.i64(i64 %not)
+  %and = and i64 %bswap, %a0
+  ret i64 %and
+}
+
+define i32 @andnot_bswap_i32(i32 %a0) nounwind {
+; X86-LABEL: andnot_bswap_i32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    notl %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: andnot_bswap_i32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    notl %eax
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    andl %edi, %eax
+; X64-NEXT:    retq
+  %not = xor i32 %a0, -1
+  %bswap = tail call i32 @llvm.bswap.i32(i32 %not)
+  %and = and i32 %bswap, %a0
+  ret i32 %and
+}
+
+define i16 @andnot_bswap_i16(i16 %a0) nounwind {
+; X86-LABEL: andnot_bswap_i16:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    notl %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: andnot_bswap_i16:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    notl %eax
+; X64-NEXT:    rolw $8, %ax
+; X64-NEXT:    andl %edi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %not = xor i16 %a0, -1
+  %bswap = tail call i16 @llvm.bswap.i16(i16 %not)
+  %and = and i16 %bswap, %a0
+  ret i16 %and
+}
+
+;
+; Fold (and X, (bitreverse (not Y)))) -> (and X, (not (bitreverse Y)))
+;
+
+define i64 @andnot_bitreverse_i64(i64 %a0) nounwind {
+; X86-LABEL: andnot_bitreverse_i64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    notl %edx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,2), %edx
+; X86-NEXT:    andl %eax, %edx
+; X86-NEXT:    notl %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; X86-NEXT:    leal (%eax,%esi,4), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT:    leal (%eax,%esi,2), %eax
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: andnot_bitreverse_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    notq %rax
+; X64-NEXT:    bswapq %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    shrq $4, %rcx
+; X64-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT:    andq %rdx, %rcx
+; X64-NEXT:    andq %rdx, %rax
+; X64-NEXT:    shlq $4, %rax
+; X64-NEXT:    orq %rcx, %rax
+; X64-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    andq %rcx, %rdx
+; X64-NEXT:    shrq $2, %rax
+; X64-NEXT:    andq %rcx, %rax
+; X64-NEXT:    leaq (%rax,%rdx,4), %rax
+; X64-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    andq %rcx, %rdx
+; X64-NEXT:    shrq %rax
+; X64-NEXT:    andq %rcx, %rax
+; X64-NEXT:    leaq (%rax,%rdx,2), %rax
+; X64-NEXT:    andq %rdi, %rax
+; X64-NEXT:    retq
+  %not = xor i64 %a0, -1
+  %bitrev = tail call i64 @llvm.bitreverse.i64(i64 %not)
+  %and = and i64 %bitrev, %a0
+  ret i64 %and
+}
+
+define i32 @andnot_bitreverse_i32(i32 %a0) nounwind {
+; X86-LABEL: andnot_bitreverse_i32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    notl %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; X86-NEXT:    leal (%eax,%edx,4), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT:    leal (%eax,%edx,2), %eax
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: andnot_bitreverse_i32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    notl %eax
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X64-NEXT:    shll $4, %ecx
+; X64-NEXT:    shrl $4, %eax
+; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X64-NEXT:    shrl $2, %eax
+; X64-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; X64-NEXT:    leal (%rax,%rcx,4), %eax
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X64-NEXT:    shrl %eax
+; X64-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; X64-NEXT:    leal (%rax,%rcx,2), %eax
+; X64-NEXT:    andl %edi, %eax
+; X64-NEXT:    retq
+  %not = xor i32 %a0, -1
+  %bitrev = tail call i32 @llvm.bitreverse.i32(i32 %not)
+  %and = and i32 %bitrev, %a0
+  ret i32 %and
+}
+
+define i16 @andnot_bitreverse_i16(i16 %a0) nounwind {
+; X86-LABEL: andnot_bitreverse_i16:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    notl %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $3855, %edx # imm = 0xF0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $3855, %eax # imm = 0xF0F
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $13107, %edx # imm = 0x3333
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $13107, %eax # imm = 0x3333
+; X86-NEXT:    leal (%eax,%edx,4), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $21845, %edx # imm = 0x5555
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $21845, %eax # imm = 0x5555
+; X86-NEXT:    leal (%eax,%edx,2), %eax
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: andnot_bitreverse_i16:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    notl %eax
+; X64-NEXT:    rolw $8, %ax
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andl $3855, %ecx # imm = 0xF0F
+; X64-NEXT:    shll $4, %ecx
+; X64-NEXT:    shrl $4, %eax
+; X64-NEXT:    andl $3855, %eax # imm = 0xF0F
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andl $13107, %ecx # imm = 0x3333
+; X64-NEXT:    shrl $2, %eax
+; X64-NEXT:    andl $13107, %eax # imm = 0x3333
+; X64-NEXT:    leal (%rax,%rcx,4), %eax
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andl $21845, %ecx # imm = 0x5555
+; X64-NEXT:    shrl %eax
+; X64-NEXT:    andl $21845, %eax # imm = 0x5555
+; X64-NEXT:    leal (%rax,%rcx,2), %eax
+; X64-NEXT:    andl %edi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %not = xor i16 %a0, -1
+  %bitrev = tail call i16 @llvm.bitreverse.i16(i16 %not)
+  %and = and i16 %bitrev, %a0
+  ret i16 %and
+}
+
+define i8 @andnot_bitreverse_i8(i8 %a0) nounwind {
+; X86-LABEL: andnot_bitreverse_i8:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    notb %al
+; X86-NEXT:    rolb $4, %al
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andb $51, %dl
+; X86-NEXT:    shlb $2, %dl
+; X86-NEXT:    shrb $2, %al
+; X86-NEXT:    andb $51, %al
+; X86-NEXT:    orb %dl, %al
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andb $85, %dl
+; X86-NEXT:    addb %dl, %dl
+; X86-NEXT:    shrb %al
+; X86-NEXT:    andb $85, %al
+; X86-NEXT:    orb %dl, %al
+; X86-NEXT:    andb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: andnot_bitreverse_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    notb %al
+; X64-NEXT:    rolb $4, %al
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andb $51, %cl
+; X64-NEXT:    shlb $2, %cl
+; X64-NEXT:    shrb $2, %al
+; X64-NEXT:    andb $51, %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andb $85, %cl
+; X64-NEXT:    addb %cl, %cl
+; X64-NEXT:    shrb %al
+; X64-NEXT:    andb $85, %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    andb %dil, %al
+; X64-NEXT:    retq
+  %not = xor i8 %a0, -1
+  %bitrev = tail call i8 @llvm.bitreverse.i8(i8 %not)
+  %and = and i8 %bitrev, %a0
+  ret i8 %and
+}

>From 11ed7f2d3cbe9a22be2edb67881efd76fb8bba17 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Wed, 16 Oct 2024 09:55:37 +0100
Subject: [PATCH 51/82] [AArch64] NFC: Regenerate aarch64-sve-asm.ll

It should use update_mir_test_checks.py instead of
update_llc_test_checks.py.
---
 llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll | 129 ++++++++++++-------
 1 file changed, 83 insertions(+), 46 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
index 4ca2fb8815797f..068e194779c153 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
@@ -1,84 +1,121 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sve -stop-after=finalize-isel | FileCheck %s --check-prefix=CHECK
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sve2p1 -stop-after=finalize-isel | FileCheck %s --check-prefix=CHECK
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-linux-gnu"
 
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:zpr = COPY [[ARG2]]
-; CHECK: [[ARG4:%[0-9]+]]:zpr_3b = COPY [[ARG1]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
 define <vscale x 16 x i8> @test_svadd_i8(<vscale x 16 x i8> %Zn, <vscale x 16 x i8> %Zm) {
+  ; CHECK-LABEL: name: test_svadd_i8
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:zpr = COPY $z1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:zpr = COPY $z0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:zpr_3b = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"add $0.b, $1.b, $2.b", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 5046281 /* reguse:ZPR */, [[COPY2]], 5373961 /* reguse:ZPR_3b */, [[COPY3]]
+  ; CHECK-NEXT:   $z0 = COPY %2
+  ; CHECK-NEXT:   RET_ReallyLR implicit $z0
   %1 = tail call <vscale x 16 x i8> asm "add $0.b, $1.b, $2.b", "=w,w,y"(<vscale x 16 x i8> %Zn, <vscale x 16 x i8> %Zm)
   ret <vscale x 16 x i8> %1
 }
 
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:zpr = COPY [[ARG2]]
-; CHECK: [[ARG4:%[0-9]+]]:zpr_4b = COPY [[ARG1]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
 define <vscale x 2 x i64> @test_svsub_i64(<vscale x 2 x i64> %Zn, <vscale x 2 x i64> %Zm) {
+  ; CHECK-LABEL: name: test_svsub_i64
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:zpr = COPY $z1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:zpr = COPY $z0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:zpr_4b = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"sub $0.d, $1.d, $2.d", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 5046281 /* reguse:ZPR */, [[COPY2]], 5242889 /* reguse:ZPR_4b */, [[COPY3]]
+  ; CHECK-NEXT:   $z0 = COPY %2
+  ; CHECK-NEXT:   RET_ReallyLR implicit $z0
   %1 = tail call <vscale x 2 x i64> asm "sub $0.d, $1.d, $2.d", "=w,w,x"(<vscale x 2 x i64> %Zn, <vscale x 2 x i64> %Zm)
   ret <vscale x 2 x i64> %1
 }
 
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:zpr = COPY [[ARG2]]
-; CHECK: [[ARG4:%[0-9]+]]:zpr_3b = COPY [[ARG1]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
 define <vscale x 8 x half> @test_svfmul_f16(<vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm) {
+  ; CHECK-LABEL: name: test_svfmul_f16
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:zpr = COPY $z1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:zpr = COPY $z0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:zpr_3b = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"fmul $0.h, $1.h, $2.h", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 5046281 /* reguse:ZPR */, [[COPY2]], 5373961 /* reguse:ZPR_3b */, [[COPY3]]
+  ; CHECK-NEXT:   $z0 = COPY %2
+  ; CHECK-NEXT:   RET_ReallyLR implicit $z0
   %1 = tail call <vscale x 8 x half> asm "fmul $0.h, $1.h, $2.h", "=w,w,y"(<vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm)
   ret <vscale x 8 x half> %1
 }
 
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:zpr = COPY [[ARG2]]
-; CHECK: [[ARG4:%[0-9]+]]:zpr_4b = COPY [[ARG1]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
 define <vscale x 4 x float> @test_svfmul_f(<vscale x 4 x float> %Zn, <vscale x 4 x float> %Zm) {
+  ; CHECK-LABEL: name: test_svfmul_f
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:zpr = COPY $z1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:zpr = COPY $z0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:zpr_4b = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"fmul $0.s, $1.s, $2.s", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 5046281 /* reguse:ZPR */, [[COPY2]], 5242889 /* reguse:ZPR_4b */, [[COPY3]]
+  ; CHECK-NEXT:   $z0 = COPY %2
+  ; CHECK-NEXT:   RET_ReallyLR implicit $z0
   %1 = tail call <vscale x 4 x float> asm "fmul $0.s, $1.s, $2.s", "=w,w,x"(<vscale x 4 x float> %Zn, <vscale x 4 x float> %Zm)
   ret <vscale x 4 x float> %1
 }
 
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:ppr = COPY $p0
-; CHECK: [[ARG4:%[0-9]+]]:ppr_3b = COPY [[ARG3]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
 define <vscale x 8 x half> @test_svfadd_f16(<vscale x 16 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm) {
+  ; CHECK-LABEL: name: test_svfadd_f16
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $p0, $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:zpr = COPY $z1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:zpr = COPY $z0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ppr = COPY $p0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ppr_3b = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:zpr = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:zpr = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"fadd $0.h, $1/m, $2.h, $3.h", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %3, 589833 /* reguse:PPR_3b */, [[COPY3]], 5046281 /* reguse:ZPR */, [[COPY4]], 5046281 /* reguse:ZPR */, [[COPY5]]
+  ; CHECK-NEXT:   $z0 = COPY %3
+  ; CHECK-NEXT:   RET_ReallyLR implicit $z0
   %1 = tail call <vscale x 8 x half> asm "fadd $0.h, $1/m, $2.h, $3.h", "=w,@3Upl,w,w"(<vscale x 16 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm)
   ret <vscale x 8 x half> %1
 }
 
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG2:%[0-9]+]]:ppr = COPY $p0
-; CHECK: [[ARG3:%[0-9]+]]:ppr = COPY [[ARG2]]
-; CHECK: [[ARG4:%[0-9]+]]:zpr = COPY [[ARG1]]
-; CHECK: INLINEASM {{.*}} [[ARG3]]
 define <vscale x 4 x i32> @test_incp(<vscale x 16 x i1> %Pg, <vscale x 4 x i32> %Zn) {
+  ; CHECK-LABEL: name: test_incp
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $p0, $z0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:zpr = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:ppr = COPY $p0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ppr = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:zpr = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"incp $0.s, $1", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 393225 /* reguse:PPR */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3)
+  ; CHECK-NEXT:   $z0 = COPY %2
+  ; CHECK-NEXT:   RET_ReallyLR implicit $z0
   %1 = tail call <vscale x 4 x i32> asm "incp $0.s, $1", "=w,@3Upa,0"(<vscale x 16 x i1> %Pg, <vscale x 4 x i32> %Zn)
   ret <vscale x 4 x i32> %1
 }
 
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:ppr = COPY $p0
-; CHECK: [[ARG4:%[0-9]+]]:ppr_p8to15 = COPY [[ARG3]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
 define <vscale x 8 x half> @test_svfadd_f16_Uph_constraint(<vscale x 16 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm) {
+  ; CHECK-LABEL: name: test_svfadd_f16_Uph_constraint
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $p0, $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:zpr = COPY $z1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:zpr = COPY $z0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ppr = COPY $p0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ppr_p8to15 = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:zpr = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:zpr = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"fadd $0.h, $1/m, $2.h, $3.h", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %3, 655369 /* reguse:PPR_p8to15 */, [[COPY3]], 5046281 /* reguse:ZPR */, [[COPY4]], 5046281 /* reguse:ZPR */, [[COPY5]]
+  ; CHECK-NEXT:   $z0 = COPY %3
+  ; CHECK-NEXT:   RET_ReallyLR implicit $z0
   %1 = tail call <vscale x 8 x half> asm "fadd $0.h, $1/m, $2.h, $3.h", "=w,@3Uph,w,w"(<vscale x 16 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm)
   ret <vscale x 8 x half> %1
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}

>From 9ad4f05ef7491cff73e7f574fed717531974fc6b Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra at googlemail.com>
Date: Wed, 16 Oct 2024 14:40:51 +0200
Subject: [PATCH 52/82] [bazel][lldb] Port
 5f2cf99e146ce99d4e148038d9bdd012331b4821

---
 .../lldb/source/Plugins/BUILD.bazel           | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
index 38493411addebf..7057f5d5c5c13a 100644
--- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
@@ -1221,12 +1221,31 @@ cc_library(
     ],
 )
 
+gentbl_cc_library(
+    name = "DynamicLoaderMacOSXDYLDProperties",
+    strip_include_prefix = "DynamicLoader/MacOSX-DYLD",
+    tbl_outs = [
+        (
+            ["-gen-lldb-property-defs"],
+            "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.inc",
+        ),
+        (
+            ["-gen-lldb-property-enum-defs"],
+            "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinPropertiesEnum.inc",
+        ),
+    ],
+    tblgen = "//lldb:lldb-tblgen",
+    td_file = "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.td",
+    deps = ["//lldb:CoreTdFiles"],
+)
+
 cc_library(
     name = "PluginDynamicLoaderMacOSXDYLD",
     srcs = glob(["DynamicLoader/MacOSX-DYLD/*.cpp"]),
     hdrs = glob(["DynamicLoader/MacOSX-DYLD/*.h"]),
     include_prefix = "Plugins",
     deps = [
+        ":DynamicLoaderMacOSXDYLDProperties",
         ":PluginObjCRuntime",
         ":PluginTypeSystemClang",
         ":PluginTypeSystemClangHeaders",
@@ -1239,6 +1258,7 @@ cc_library(
         "//lldb:Target",
         "//lldb:TargetHeaders",
         "//lldb:Utility",
+        "//llvm:Support",
         "//llvm:TargetParser",
     ],
 )

>From 157f10ddf2d851125a85a71e530dc9d50cb032a2 Mon Sep 17 00:00:00 2001
From: Yuta Saito <kateinoigakukun at gmail.com>
Date: Wed, 16 Oct 2024 21:42:54 +0900
Subject: [PATCH 53/82] Revert "[llvm-cov][WebAssembly] Read `__llvm_prf_names`
 from data segments" (#112520)

This reverts commit efc9dd4118a7ada7d8c898582f16db64827f7ce0 in order to
fix a Windows test failure:
https://github.com/llvm/llvm-project/pull/111332#issuecomment-2416462512
---
 .../Coverage/CoverageMappingReader.cpp        |  64 +++---------------
 .../llvm-cov/Inputs/binary-formats.v6.wasm32  | Bin 87781 -> 0 bytes
 .../Inputs/binary-formats.wasm.proftext       |   4 --
 llvm/test/tools/llvm-cov/binary-formats.c     |   7 --
 4 files changed, 9 insertions(+), 66 deletions(-)
 delete mode 100755 llvm/test/tools/llvm-cov/Inputs/binary-formats.v6.wasm32
 delete mode 100644 llvm/test/tools/llvm-cov/Inputs/binary-formats.wasm.proftext

diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
index 461fc43d32f8df..8881bffe41c57c 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
@@ -18,14 +18,12 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/BinaryFormat/Wasm.h"
 #include "llvm/Object/Archive.h"
 #include "llvm/Object/Binary.h"
 #include "llvm/Object/COFF.h"
 #include "llvm/Object/Error.h"
 #include "llvm/Object/MachOUniversal.h"
 #include "llvm/Object/ObjectFile.h"
-#include "llvm/Object/Wasm.h"
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Compression.h"
@@ -1079,53 +1077,6 @@ lookupSections(ObjectFile &OF, InstrProfSectKind IPSK) {
   return Sections;
 }
 
-/// Find a section that matches \p Name and is allocatable at runtime.
-///
-/// Returns the contents of the section and its start offset in the object file.
-static Expected<std::pair<StringRef, uint64_t>>
-lookupAllocatableSection(ObjectFile &OF, InstrProfSectKind IPSK) {
-  // On Wasm, allocatable sections can live only in data segments.
-  if (auto *WOF = dyn_cast<WasmObjectFile>(&OF)) {
-    std::vector<const WasmSegment *> Segments;
-    auto ObjFormat = OF.getTripleObjectFormat();
-    auto Name =
-        getInstrProfSectionName(IPSK, ObjFormat, /*AddSegmentInfo=*/false);
-    for (const auto &DebugName : WOF->debugNames()) {
-      if (DebugName.Type != wasm::NameType::DATA_SEGMENT ||
-          DebugName.Name != Name)
-        continue;
-      if (DebugName.Index >= WOF->dataSegments().size())
-        return make_error<CoverageMapError>(coveragemap_error::malformed);
-      auto &Segment = WOF->dataSegments()[DebugName.Index];
-      Segments.push_back(&Segment);
-    }
-    if (Segments.empty())
-      return make_error<CoverageMapError>(coveragemap_error::no_data_found);
-    if (Segments.size() != 1)
-      return make_error<CoverageMapError>(coveragemap_error::malformed);
-
-    const auto &Segment = *Segments.front();
-    auto &Data = Segment.Data;
-    StringRef Content(reinterpret_cast<const char *>(Data.Content.data()),
-                      Data.Content.size());
-    return std::make_pair(Content, Segment.SectionOffset);
-  }
-
-  // On other object file types, delegate to lookupSections to find the section.
-  auto Sections = lookupSections(OF, IPSK);
-  if (!Sections)
-    return Sections.takeError();
-  if (Sections->size() != 1)
-    return make_error<CoverageMapError>(
-        coveragemap_error::malformed,
-        "the size of coverage mapping section is not one");
-  auto &Section = Sections->front();
-  auto ContentsOrErr = Section.getContents();
-  if (!ContentsOrErr)
-    return ContentsOrErr.takeError();
-  return std::make_pair(*ContentsOrErr, Section.getAddress());
-}
-
 static Expected<std::unique_ptr<BinaryCoverageReader>>
 loadBinaryFormat(std::unique_ptr<Binary> Bin, StringRef Arch,
                  StringRef CompilationDir = "",
@@ -1156,20 +1107,23 @@ loadBinaryFormat(std::unique_ptr<Binary> Bin, StringRef Arch,
 
   // Look for the sections that we are interested in.
   auto ProfileNames = std::make_unique<InstrProfSymtab>();
+  std::vector<SectionRef> NamesSectionRefs;
   // If IPSK_name is not found, fallback to search for IPK_covname, which is
   // used when binary correlation is enabled.
-  auto NamesSection = lookupAllocatableSection(*OF, IPSK_name);
+  auto NamesSection = lookupSections(*OF, IPSK_name);
   if (auto E = NamesSection.takeError()) {
     consumeError(std::move(E));
-    NamesSection = lookupAllocatableSection(*OF, IPSK_covname);
+    NamesSection = lookupSections(*OF, IPSK_covname);
     if (auto E = NamesSection.takeError())
       return std::move(E);
   }
+  NamesSectionRefs = *NamesSection;
 
-  uint64_t NamesAddress;
-  StringRef NamesContent;
-  std::tie(NamesContent, NamesAddress) = *NamesSection;
-  if (Error E = ProfileNames->create(NamesContent, NamesAddress))
+  if (NamesSectionRefs.size() != 1)
+    return make_error<CoverageMapError>(
+        coveragemap_error::malformed,
+        "the size of coverage mapping section is not one");
+  if (Error E = ProfileNames->create(NamesSectionRefs.back()))
     return std::move(E);
 
   auto CoverageSection = lookupSections(*OF, IPSK_covmap);
diff --git a/llvm/test/tools/llvm-cov/Inputs/binary-formats.v6.wasm32 b/llvm/test/tools/llvm-cov/Inputs/binary-formats.v6.wasm32
deleted file mode 100755
index 5a606d5a2f69fd150690bd17ebb9d922cc17c383..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 87781
z#4rnFvL95($py2FUiOYH16WZ`0PJ33K>Sl|7Ie_47y($(!10G+57ePpKxqm#Y_#d4
zU#u<8w4iCD&b{9&_fnh$Y{HOyO!`VXGLk}ML2i^cL_-`~mIMB?F{<80l>W$FmZ<-s
z#jPmWU15kUQ6zL{;^?S>`H1mP<@4Wr+W-rQhx4Dh^}rob6dius;VZ#`!*9Fe at Y}dP
zaQMm{hp$9Ds at 7(wmlrw*ms-u)*7W7zM17&wm|m<!jb<lm*E+%3<wm34xDYj$J5h5s
zYImkP^_gg<x!mZ~TJ2zZVWBxQ-Kjk>y|7$6LnG9`KY2NGu{JY*l)Gv>c;t~M9*ul~
z=&jQ&HF{sPuN^$zm~S*MH)yF<o337sF1PBPPOT9%rW?Ubv(c$HmYd7%Xt7zXMfG;H
z+-NT^Ej3%6S~Y4mqRvI0EiFuUW}B_WWHbVtdKZF)>6!URAVokquP61!EUhjsPB*I2
zLcLK7YK at h8tJzqrH9FDCbgMpnexVkeY^+Q#)T@!OqT1Ez&UDmiMvJx9g<7z0p~}C7
z<;uQlaOh%lv36)44Ac)@xOnN%<z{RCpxqv<)>aO+FV|-~2inc$)=cft!otep0h*kv
z&2$a{#1c)n4zxOl7V75@`K}I~Y_vNqhbQ1Qv>u%Kv+hPzr<dvTLF*3wU7ijed-Tk)
zlV^id at u{E%sq1h=eCA|u^3(~VWaWu7m7|ZJIC1=J<=n|{I-WjzG=5v<)Z>p<o`@eg
zS2<&EAM<a|oIQQw<Rix`PaHpc^z^wk5A`OfRKz5eq~O`dDko2!e5`W%)cwbU`;8NN
z8DVC*)q-dXS78JGbh!dfh#sAuxp?68ozd|}PDCR0{ijcyh{Ph(o%X at tSnd4sg$Pz?
z!pO7&`_$p3YO~gMeD63cCBMiHF{0^C<dJbO5}n}JI~`9%$WBrm9&M|1=OS$9Nj~d2
zsJU=3aNLlTcI*igcbd(py@)&=ypF6q-Vkq!4KLI>*1987v=Iy68MSJ&H4rp|oYWf#
z+QI1Y6f7_G(b9D1;*E88vDxmJUJ!OA6Lv#=w7~URSCJcQeF<6nsvEuz9;LV>T+kGB
z9Srt2mKPRo3r<$AL~ndkbgt6^_UP7Iqw>i{wRS~>TAFU7>pCr6t8m1a7VC}aPP0`G
zPAxA!ajxEhGu|8>wtC$sGt-UNbfWXM=<T&uv)oU%<E>V+B?ULS;qxvrO{b*XAU?Sh
zCD%*pcKz+Ouf3&K?Q*?Qy?#4R=hUJ{J+q_iP513<N3+uq{1vLc@?cQ5OV at 8AH3hHK
z$n~K<VW3GUWH)LK<nq3D`Kzq5LGctabdOgyuKDSinOeIoV`N4$HAfq?;kf`KYARH!
zs4fM+EPLa$&I)E!+Gnwj^mk at 1M)yUvD@&LN1a}|$<vfO00>7`ihLckG_U*e8wW(2`
zt=C%7#pyOI36Cd_s-{}7cvm9<0pKRE*Kchxt=fuUai?G5@@|751Y`Uv&CH|~zWQcP
zH9N5UX>il3WA~d2U0W=j6^X*pN_`sn6773#53U)PYv~*a^cL39&|0~Xnu%DL0vEl#
zf1++BA0j$^p7SQdw3yi$IUu+l{HZpVrMPh^mWBOx)HPkzdRvlSJvjPU6FqKHU+>%*
zO;@YYfrS~og?gjYh6o4L(gEKRU7K;KH|C}4<jep<+-}!eX2`no8tvcL7KHq(-nIra
zDgSgOuhWvl)Tmwdz}n!V2mxqAhL$&M%_ZyQz at n^MbYNxaK;jM_Si+%cBZqH#W8=-j
zP~slqzO_6CQaU7+WLCX5WDI&R5;{%Bf|CmiwF}b=(fO;$cDr_ISq3dQ)?A#1`(Tb{
zD>z$g<G`9IwrWeQT3eC at P6H?(;k9+H24|$t+a_hzTBBa82B!hkiJwTy&7z#nDs^xq
zkU`UYSh{Mxg(NmxSJO9ek)+=oWdya}?z98*(oD)0rg0F>lsSWcbxCj_2-676LTwfw
z8b719QfCyWokzV`12<PS4D%9^_x4pBP;5|Zc}Ys3eHBk*5lLKF=$7IMPB8u|t`F8?
z1vO}cPCVCktu`Or-)gqo2V8gZcIT--T|Hp_l8w+c9UMzWX;`FXa0bmA3|*Q at IK|9e
z1y8kB59*{%aKe{6{e@?j+hFoR90upF<JQkyG~nWDm9|U)Q6 at OrYR;plmg*qs<e}4^
zo(Ty&*bo6>P_>i<9)FI$T?(CSL+<YFW3?+CM at Pfx@hi0%?*aOb5>!DlQ^!H<FF+Nr
z%O8`_1?R3Vo^LJ?^az7IYA$u7q_ogulr|OHTw+K<t_{;8m at yf?tF`t_tG)z70c%RL
zqtkdl)POOP*Ba~0#kx~_#&0T=AmW4yTi0T at AnjWK8k!XC^!AW4R~7o}Wpuf at OykWN
zR8epaOXLDRn7xV4HDaA<!KQM!3C<AxYQNZ^l4fcsuHeLS!*PiOslK?hP?KqMqJ*(&
zPa_Yw5)2I{!xTv(P9lXJXRTUr{|coa5pKJts|FN2MfrA`GtC80G2y_s34$cuZN@^7
zdaTw)-8zBxNP=rGEvw7mQ4d8IVQI6MiTrTXfyrdnx0b%vkh)~*dz^+F21xfVVe#4N
zMTw?bNMD)=`4I(4!Tvr@)jEnGB{)?D#mW70qbo%lNoK><>lMW5^A;qFlg=;C5({`O
z)E&?{L!KEVC+r~0KND0l>BFwJOoDW@?^3II!Njurw)ZJa1(O<EU&KD4kN?tglPJ1&
zWu{iEiY#dD`o-oF>R-xCEOY<#()9WILcLQr>1hLYeWnht1pWUlXHEtGHvFC3Kg{h2
z|K`9SWVYwN82nH+%-%cj6XDb0>FlB0p5W61&jrQop6n;X-_8yP&kqD)HS<SdZQ!oV
zcW3 at p=0Nt5+<zH(U*=1>-2=k|AIm&EFp;?uoXxy3`$yUT8vNtzQ at NLd4`r_o+>&X8
zi`jDS!?|O at ABP_ep2>c5 at cD2k_x-uQp1UJ>Tlo6$-N7dZek8m<b0Yjd13~b at WDx8O
zMuVL at cM4>l=)V#D$?K2)+Zk*f<$ZQ^C<v!UI6WyC8RTU?7*ri!(@Oe3o0A(_9 at G0V
zl1Rab9*+e4P2VTw(%((#RS;~`wfz}!kNMmCzNaXcg`V3jT<7#B7;%sJ+xxz!D3`^a
z+pW5`KO^ojfBF9|p6=SEzZ?J8_ui1Vak=#AczP8CP{98gagX`i`@W|rm)m=8Cv<Ip
zM%-im^8drH{f5`w^^JGmbMNcpqxT;>euATx4(2}{CKZSMg)jmwp-d2D`IqCL=s(E6
zA^xetr(Vw={!<-|Yg9Kq|J_8TKcJvC=Xl<+w;RDPgbKBr-rUmwm+D{tsSS5^V}HfE
zLHxnP`D8%cy>K{x_^1EMwjbOx`Qg>1l`pMnH$_BJApY#(e6c__M^0J=3pJxZE5Yl=
zDiwK*6>QAxawBQCm%h&(dh*h#OYi$fVHkvAZhvrla9>X);z;#jlEtqOo7o=Z4s{ln
z3g=P(t*Zx;6wizb{`SF~QV_w;9c$jM+<ox!bbIl2-w^)H{x62X?Uf3K%d};M=k-dv
zF}>8j*z8nrCs*pV%dgb{+7+U(@bC?mRm^TRG6!D0hEL$AgY4^Wym;D2?gmCMTdmA2
zG~2a-L$BTeWoN6(^a`TEZ~SU1;3-yS7p5<?hwi?yYVwrSlPYRJ!#6e-C^KChxdCc0
z76k=<Frzm%n4|{YWJN5LzwL$wBbYYbSW)@sV>dQbOKfbu=hY!lOk0@&h)%7N`oUZ7
zxWS5YTnjf+m;k6k;M3W<wIXpro{3B<ijvyJJIVFeX348pE0=52^Y>W^Ec*GG3QwyQ
za;*lX#f94jE6}CY8P7>mOiX$8fMX%Iq2g1m6+(s9_MK~9p0AOSttiNvoLuv&Mn8J7
zW7nD&u8!#0-nnDVJ5{nbyKY+Z#??xK)v2{ms$%kT_u7}P?n;BXi}p=xhoGA7)t+Ps
zM(T>wR+W0Sy|=g^Y~XncBgd8(mvEuif{u)$YA at 7hO1G>5U2hu8i<KpOVzLnxMbYi@
zTC}K?PeR=c^O}lIXi#r6xMU8dqB)p~%TiQtDX%H2W+cEBba2Iaocmy>zUDS4Qz5}Y
z3VC|r*4-Nv79FkC*F+n<P4I3d_M>Rmb(W0WsY*eX)VXaftrBFZ%iBw9+VZs+e5ZBQ
zy1QfVnj%U4q=P%RHE?v>_q)lkGmZoY_984Z%S<@bj*1wk%ol#Fd4 at 14UZ`Ja94you
z_0Caq*x#y*X6w*a<AL5shqu|I`%To{i<75cThQoFrnOI={svd<7;zfiS(7JEzs|8#
za%+9u)&K0o!gBlKH*U5*?ENuNa<?m!-k&7YyQg?rAX at rEJgRc{PS~^4jsC~49|S+7
z`R|*uW*?|^9%)WjA8l5b7i#Ai_jG5uReQr)p0Wu<NanKniVUv4aV;ZQ{=72VYA$-N
zd(-AN4HEsIoPBG(ara$s-c8pjjAsdPTh#~A)V0<**!3;j*T2>CBU9_2TO%4|yl4GO
zO$#W)r}sxUX!H~i-9Jga6ZKYmEj-7 at dmB^}g5mh2A&cbY@=`hg6hZ66?j*6+``EKu
z`++UVTc~gfnSJoqHN4u*L03?~bX{2y=qJY$;yiIiFq}0kJw&FLkc<h at w|8mkGujTD
z58p%+z2iU81h<{qTU*mh<klY9vGMD(k3HIl<Yd^s+NrIo2G&lccs*V0X~||~&OUa=
ziuEA>>|=k);Y~9cU5IT!IZZ~K8(ds&@cY<i4NS=t<LcJqqtdeO`Uw%_{^ge5$}T)o
zYh37DeCy`^r*FGS+7aW`K8`^=-7rpGIG5z&|8jxi2F00Xb3qjSrakK at k)Rd>xlsEn
z6VyohJTu)n0b3t$to+q&OVh1(EpAldh2-^L8 at GqaEv2PM|N3^koo=DWn#bG>e`SQ0
z&zw5P1lTukV*;X~S$hVkIaD<<UG7rZ+Z(7zuE9^hBdWFYJH<FKmqzQx`k=-3t at g}X
z>(*_py4I+)u-&9ITNTq3Htq_*&}MDBJrX=`t#>YxT09Hyw;Ho+TkqDqFp^feI8tr;
z*{ga{?u^=D5dPWRn8)qu^O}L59|KLxOD9h|Rk$G1CIhG5;$*$;Z;gUym3#*3YD4Oz
zr=ZB%?W8#&4ohCAeJ^dUw@)=2-^4uOvHFF2=h9HUZ5OS4&mxI-!Fg}K`muVevsGy6
zL-lf at +Ux-gyS_3}smxrtQn|3uJU_jFD7P at G_0Hw3XFa05u<pO@%Kpj&kDNXlKT at f*
zm(N#DR!;V<Ht^*R&MYlmUCXGX2A7$rTB%hQ=W#CIzLEKqyDT5!9R<@PM`8N=7nj~P
zbMeBH?#Y=>>qLz?E|OnQ<*m#k<e*Me-#KPCk2jQ?JW;*o441uw6pqJM!-?KP>GN+1
zx7c&iP)6|MSDp at c*^?7^oTh9+<hdHcq8Z(1!kzX~BZrgwQ2VaXb$o7_sV-!#e&ONz
z!os)urzBCSFKeZz-xdyPvc5U<?cs2B!Cihwn44|YYTp?SRu^>fH^M>r#?6`U3Wu=O
zy87;Lj5(hr0<p@>#pTBQ_k<&}i8TJraKOTv?+phklCZxO=6w*o(yUkicDTiP(`hjJ
z?hwLTz-r-X8%DH{^ZUZ0f8G;Vet(GHXoZu9Y55;WTTX(+9}LHR;l5DtheCWuYoq%Z
zA#wrV4~MWs`V#V3%%#`r=smrC*eKtdLf;>v{YY;oYXh;r6K?Ky0!pht+WSC3)sOXd
zWUAqPA)ciHwqbzuY*^@hExqyn?xQ5wc`igC)QH70jpoedYDGnfTpLaC{ND|62-g%a
zNl8omy>P%H at V_7CRP!H%+j}bLiErrx;eb{AU^rko+#gTix2~6(OXfZ*s#T at -hr-t+
zE;Qkl*R1{}VHn6=HnFNzKO7Qgq#*XjW6!UDGku;(tIoxbgt_+l*8E39G;ms#Uh&p&
zhHdH8PlSX_>ASApPu{-}%J-x2$HL)CWocQLKN)VS=zXQNSiv*?sc;ZR;r^$?VP?l|
zV((`{Qa{r+r&-a2=Ff_-<c((9m1e^N(4Pwjo!R*LFef?thhbktT2(~)kHVZZ=r4o=
z)#b&-s~-<XFu*g+b2i$a2nS|oSU~iP0*iTAW(t2vm$Oyj^+k2x?o{hdmpA$4a6tUF
z^2so#-+!zDxYxS<m2il5#ee^#`$D(B8V(v0uKd$(d-mYhJVgI29Go?__~%^A%Jcpg
zTp&J;=1Xd7R?KA?m0t&5LG~}hp;?<3p8Z!mQ750`w^LhK_>FGg8uY)`U^`9g?%(Ko
z)-JDnnhxyhH^b3Nr8&D$K{3t$TUFMLW0&7j1>HiPe+N|>uFHQP4qcvUb()Lk{{tf<
z=wCelAL+ftl7-Lk3w7K7DI8w$0RDEEch&IWTebfj4#B-rU!M)LL}s5u6*K=*akJWR
zKdK^GOz}J6$V$@Ze^K|&S@^GE4kcdyJj%DWICHV}1$VKy^t<k2>FV#<1>E%C;HgfF
zGXGs_)LpW==YP1zi%Y+67uDsZKd=iB{y)PZiZx~ym;O+LZ+Dg&GfP)r_D`3t{*k+)
z<j}_lwgs>wDsE=JveYyI4EH{8`muvm{N{s+KpsSY+w^Yb{B*mPxox8Yw9)ERQnzr{
zau?bC8y5wAYf9#B-?-$OMlH87a2){pTeYHt*D1QDS>&fZbVIPJ<nWC(y25(VksE4M
zB}ebv2myVW02b#Z8IJtTDlIkAC6f6A&I^32h1{EVtZx!oM=g!*S^owZ(}by!c=LuM
z>;uMRn}}k|<oc%Qq0uZ at 8ykRVerOu-DW2JS)A}}^IAi4=Y&PeM+x5D?CHW;I#|MI;
zgRQ1`H5 at RO&A`nb%-M=)mF<0?f;r4&TEciFxOHTSMwVw-!4z)dQWMp+>VflDW$C6H
z7Y1K@&%wh7?+%{W;-k!2a+6q&_P`za^UDkK2b^%+F}kR^Knp?d*s{26=se)O+4db<
V(uIr%G*R2WW4KLte}K5~{{w2iz+3<T

diff --git a/llvm/test/tools/llvm-cov/Inputs/binary-formats.wasm.proftext b/llvm/test/tools/llvm-cov/Inputs/binary-formats.wasm.proftext
deleted file mode 100644
index 20fc3816c2255a..00000000000000
--- a/llvm/test/tools/llvm-cov/Inputs/binary-formats.wasm.proftext
+++ /dev/null
@@ -1,4 +0,0 @@
-__main_argc_argv
-0x0
-1
-100
diff --git a/llvm/test/tools/llvm-cov/binary-formats.c b/llvm/test/tools/llvm-cov/binary-formats.c
index bb61b288cfc62e..a5bfc012860ec3 100644
--- a/llvm/test/tools/llvm-cov/binary-formats.c
+++ b/llvm/test/tools/llvm-cov/binary-formats.c
@@ -10,11 +10,4 @@ int main(int argc, const char *argv[]) {}
 // RUN: llvm-cov show %S/Inputs/binary-formats.v3.macho64l -instr-profile %t.profdata -path-equivalence=/tmp,%S %s | FileCheck %s
 // RUN: llvm-cov show %S/Inputs/binary-formats.v6.linux64l -instr-profile %t.profdata -path-equivalence=/tmp,%S %s | FileCheck %s
 
-// RUN: llvm-profdata merge %S/Inputs/binary-formats.wasm.proftext -o %t.wasm.profdata
-// NOTE: The wasm binary is built with the following command:
-//   clang -target wasm32-unknown-wasi %s -o %S/Inputs/binary-formats.v6.wasm32 \
-//     -mllvm -enable-name-compression=false \
-//     -fprofile-instr-generate -fcoverage-mapping -lwasi-emulated-getpid -lwasi-emulated-mman
-// RUN: llvm-cov show %S/Inputs/binary-formats.v6.wasm32 -instr-profile %t.wasm.profdata -path-equivalence=/tmp,%S %s | FileCheck %s
-
 // RUN: llvm-cov export %S/Inputs/binary-formats.macho64l -instr-profile %t.profdata | FileCheck %S/Inputs/binary-formats.canonical.json

>From 4e6fa78f4dc8ba01eb63ccaf010f985df6e4a361 Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12 at gmail.com>
Date: Wed, 16 Oct 2024 13:43:36 +0100
Subject: [PATCH 54/82] [lldb][test] Explicitly add target triple to
 no_unique_address Shell tests (#112523)

Follow up to https://github.com/llvm/llvm-project/pull/111902.

Makes sure all the `no_unique_address` tests live in the same place and
no longer rely on the host target triple (so we don't need to account
for `[[msvc::no_unique_address]]` on Windows).

Since we no longer compile with the host compiler, this patch also adds
`-c` to the compilation command: the tests never need the linked binary,
and on Darwin linking through Clang would require the `xcrun` prefix to
set up the SDK paths. `no_unique_address-with-bitfields.cpp` already
does this.
---
 .../SymbolFile/DWARF/{ => x86}/no_unique_address-alignment.cpp  | 2 +-
 .../DWARF/{ => x86}/no_unique_address-base-alignment.cpp        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename lldb/test/Shell/SymbolFile/DWARF/{ => x86}/no_unique_address-alignment.cpp (89%)
 rename lldb/test/Shell/SymbolFile/DWARF/{ => x86}/no_unique_address-base-alignment.cpp (91%)

diff --git a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-alignment.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-alignment.cpp
similarity index 89%
rename from lldb/test/Shell/SymbolFile/DWARF/no_unique_address-alignment.cpp
rename to lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-alignment.cpp
index 1488199a3ad2d3..e198bf0cafeaac 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-alignment.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-alignment.cpp
@@ -1,6 +1,6 @@
 // XFAIL: *
 
-// RUN: %clangxx_host -gdwarf -o %t %s
+// RUN: %clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s
 // RUN: %lldb %t \
 // RUN:   -o "expr alignof(OverlappingFields)" \
 // RUN:   -o "expr sizeof(OverlappingFields)" \
diff --git a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-base-alignment.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-base-alignment.cpp
similarity index 91%
rename from lldb/test/Shell/SymbolFile/DWARF/no_unique_address-base-alignment.cpp
rename to lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-base-alignment.cpp
index 15d8de0e3ee988..c4bcfc473277f6 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-base-alignment.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-base-alignment.cpp
@@ -1,6 +1,6 @@
 // XFAIL: *
 
-// RUN: %clangxx_host -gdwarf -o %t %s
+// RUN: %clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s
 // RUN: %lldb %t \
 // RUN:   -o "expr alignof(OverlappingDerived)" \
 // RUN:   -o "expr sizeof(OverlappingDerived)" \

>From 1dfb104eac73863b06751bea225ffa6ef589577f Mon Sep 17 00:00:00 2001
From: Sirui Mu <msrlancern at gmail.com>
Date: Wed, 16 Oct 2024 12:51:50 +0800
Subject: [PATCH 55/82] [mlir][LLVMIR] Add operand bundle support for
 llvm.intr.assume (#112143)

This patch adds operand bundle support for `llvm.intr.assume`.

This patch actually contains two parts:

- `llvm.intr.assume` now accepts operand-bundle-related attributes and
operands. The op places no constraints on which operand bundles are
attached, even though only a small set of bundle tags is actually
meaningful. I plan to add support for some of them (e.g. `aligned` and
`separate_storage` are the ones I care about; other people may be
interested in other bundles) in future patches. A short sketch of the
new assembly syntax follows below.

- The definitions of `llvm.call`, `llvm.invoke`, and
`llvm.call_intrinsic` previously defined `op_bundle_tags` as an
operation property. Applying the same approach to the intrinsic
operations would introduce unnecessary burden, because properties are
not accessible through `Operation *`, while the intrinsic import/export
code has to work with `Operation *`. This PR therefore changes
`op_bundle_tags` from a property to an array attribute.

This patch relands commit d8fadad07c952c4aea967aefb0900e4e43ad0555.
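
As a quick illustration of the new assembly format, here is a minimal
sketch mirroring the roundtrip tests added in this patch (the "align"
tag is only an example; the op itself places no restriction on which
bundle tags are used):

  llvm.func @assume_sketch(%p : !llvm.ptr) {
    %cond = llvm.mlir.constant(1 : i1) : i1
    %align = llvm.mlir.constant(16 : i32) : i32
    // Plain form: only the i1 condition, no operand bundles.
    llvm.intr.assume %cond : i1
    // Bundled form: a tag string followed by its typed operands.
    llvm.intr.assume %cond ["align"(%p, %align : !llvm.ptr, i32)] : i1
    llvm.return
  }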
---
 .../Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td   |  1 +
 .../mlir/Dialect/LLVMIR/LLVMDialect.td        |  2 +
 .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td   | 44 +++++++--
 .../include/mlir/Dialect/LLVMIR/LLVMOpBase.td | 25 +++--
 mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td   | 18 +---
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td  |  2 +-
 .../include/mlir/Target/LLVMIR/ModuleImport.h |  2 +
 mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp    | 96 ++++++++++++-------
 .../LLVMIR/LLVMIRToLLVMTranslation.cpp        |  6 ++
 .../LLVMIR/LLVMToLLVMIRTranslation.cpp        | 16 +++-
 .../Dialect/NVVM/LLVMIRToNVVMTranslation.cpp  |  6 ++
 mlir/lib/Target/LLVMIR/ModuleImport.cpp       | 32 ++++++-
 mlir/lib/Target/LLVMIR/ModuleTranslation.cpp  | 37 ++++++-
 .../expand-then-convert-to-llvm.mlir          |  2 +-
 .../MemRefToLLVM/memref-to-llvm.mlir          |  4 +-
 mlir/test/Dialect/LLVMIR/inlining.mlir        |  4 +-
 mlir/test/Dialect/LLVMIR/roundtrip.mlir       | 27 ++++++
 mlir/test/Target/LLVMIR/Import/intrinsic.ll   | 12 ++-
 .../test/Target/LLVMIR/llvmir-intrinsics.mlir | 15 +++
 mlir/test/Target/LLVMIR/llvmir-invalid.mlir   |  2 +-
 20 files changed, 276 insertions(+), 77 deletions(-)

diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
index 0e38325f9891ac..e81db32bcaad03 100644
--- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
@@ -71,6 +71,7 @@ class ArmSME_IntrOp<string mnemonic,
           /*bit requiresAccessGroup=*/0,
           /*bit requiresAliasAnalysis=*/0,
           /*bit requiresFastmath=*/0,
+          /*bit requiresOpBundles=*/0,
           /*list<int> immArgPositions=*/immArgPositions,
           /*list<string> immArgAttrNames=*/immArgAttrNames>;
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
index 27a2b418aadb2a..ea82f7f7b8e124 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
@@ -59,6 +59,8 @@ def LLVM_Dialect : Dialect {
     static StringRef getStructRetAttrName() { return "llvm.sret"; }
     static StringRef getWriteOnlyAttrName() { return "llvm.writeonly"; }
     static StringRef getZExtAttrName() { return "llvm.zeroext"; }
+    static StringRef getOpBundleSizesAttrName() { return "op_bundle_sizes"; }
+    static StringRef getOpBundleTagsAttrName() { return "op_bundle_tags"; }
     // TODO Restrict the usage of this to parameter attributes once there is an
     // alternative way of modeling memory effects on FunctionOpInterface.
     /// Name of the attribute that will cause the creation of a readnone memory
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
index ab40c8ec4b6588..845c88b1be7750 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
@@ -120,7 +120,8 @@ def LLVM_Log2Op : LLVM_UnaryIntrOpF<"log2">;
 def LLVM_LogOp : LLVM_UnaryIntrOpF<"log">;
 def LLVM_Prefetch : LLVM_ZeroResultIntrOp<"prefetch", [0],
   /*traits=*/[], /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
-  /*immArgPositions=*/[1, 2, 3], /*immArgAttrNames=*/["rw", "hint", "cache"]
+  /*requiresOpBundles=*/0, /*immArgPositions=*/[1, 2, 3],
+  /*immArgAttrNames=*/["rw", "hint", "cache"]
 > {
   let arguments = (ins LLVM_AnyPointer:$addr, I32Attr:$rw, I32Attr:$hint, I32Attr:$cache);
 }
@@ -176,7 +177,8 @@ class LLVM_MemcpyIntrOpBase<string name> :
      DeclareOpInterfaceMethods<DestructurableAccessorOpInterface>,
      DeclareOpInterfaceMethods<SafeMemorySlotAccessOpInterface>],
     /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1,
-    /*immArgPositions=*/[3], /*immArgAttrNames=*/["isVolatile"]> {
+    /*requiresOpBundles=*/0, /*immArgPositions=*/[3],
+    /*immArgAttrNames=*/["isVolatile"]> {
   dag args = (ins Arg<LLVM_AnyPointer,"",[MemWrite]>:$dst,
                   Arg<LLVM_AnyPointer,"",[MemRead]>:$src,
                   AnySignlessInteger:$len, I1Attr:$isVolatile);
@@ -206,7 +208,8 @@ def LLVM_MemcpyInlineOp :
      DeclareOpInterfaceMethods<DestructurableAccessorOpInterface>,
      DeclareOpInterfaceMethods<SafeMemorySlotAccessOpInterface>],
     /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1,
-    /*immArgPositions=*/[2, 3], /*immArgAttrNames=*/["len", "isVolatile"]> {
+    /*requiresOpBundles=*/0, /*immArgPositions=*/[2, 3],
+    /*immArgAttrNames=*/["len", "isVolatile"]> {
   dag args = (ins Arg<LLVM_AnyPointer,"",[MemWrite]>:$dst,
                   Arg<LLVM_AnyPointer,"",[MemRead]>:$src,
                   APIntAttr:$len, I1Attr:$isVolatile);
@@ -232,7 +235,8 @@ def LLVM_MemsetOp : LLVM_ZeroResultIntrOp<"memset", [0, 2],
      DeclareOpInterfaceMethods<DestructurableAccessorOpInterface>,
      DeclareOpInterfaceMethods<SafeMemorySlotAccessOpInterface>],
     /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1,
-    /*immArgPositions=*/[3], /*immArgAttrNames=*/["isVolatile"]> {
+    /*requiresOpBundles=*/0, /*immArgPositions=*/[3],
+    /*immArgAttrNames=*/["isVolatile"]> {
   dag args = (ins Arg<LLVM_AnyPointer,"",[MemWrite]>:$dst,
                   I8:$val, AnySignlessInteger:$len, I1Attr:$isVolatile);
   // Append the alias attributes defined by LLVM_IntrOpBase.
@@ -286,7 +290,8 @@ def LLVM_NoAliasScopeDeclOp
 class LLVM_LifetimeBaseOp<string opName> : LLVM_ZeroResultIntrOp<opName, [1],
     [DeclareOpInterfaceMethods<PromotableOpInterface>],
     /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
-    /*immArgPositions=*/[0], /*immArgAttrNames=*/["size"]> {
+    /*requiresOpBundles=*/0, /*immArgPositions=*/[0],
+    /*immArgAttrNames=*/["size"]> {
   let arguments = (ins I64Attr:$size, LLVM_AnyPointer:$ptr);
   let assemblyFormat = "$size `,` $ptr attr-dict `:` qualified(type($ptr))";
 }
@@ -306,7 +311,8 @@ def LLVM_InvariantStartOp : LLVM_OneResultIntrOp<"invariant.start", [], [1],
 def LLVM_InvariantEndOp : LLVM_ZeroResultIntrOp<"invariant.end", [2],
     [DeclareOpInterfaceMethods<PromotableOpInterface>],
     /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
-    /*immArgPositions=*/[1], /*immArgAttrNames=*/["size"]> {
+    /*requiresOpBundles=*/0, /*immArgPositions=*/[1],
+    /*immArgAttrNames=*/["size"]> {
   let arguments = (ins LLVM_DefaultPointer:$start,
                        I64Attr:$size,
                        LLVM_AnyPointer:$ptr);
@@ -368,7 +374,7 @@ class LLVM_ConstrainedIntr<string mnem, int numArgs,
     SmallVector<Value> mlirOperands;
     SmallVector<NamedAttribute> mlirAttrs;
     if (failed(moduleImport.convertIntrinsicArguments(
-        llvmOperands.take_front( }] # numArgs # [{),
+        llvmOperands.take_front( }] # numArgs # [{), {}, false,
         {}, {}, mlirOperands, mlirAttrs))) {
       return failure();
     }
@@ -429,7 +435,26 @@ def LLVM_USHLSat : LLVM_BinarySameArgsIntrOpI<"ushl.sat">;
 //
 
 def LLVM_AssumeOp
-  : LLVM_ZeroResultIntrOp<"assume", []>, Arguments<(ins I1:$cond)>;
+    : LLVM_ZeroResultIntrOp<"assume", /*overloadedOperands=*/[], /*traits=*/[],
+                            /*requiresAccessGroup=*/0,
+                            /*requiresAliasAnalysis=*/0,
+                            /*requiresOpBundles=*/1> {
+  dag args = (ins I1:$cond);
+  let arguments = !con(args, opBundleArgs);
+
+  let assemblyFormat = [{
+    $cond
+    ( custom<OpBundles>($op_bundle_operands, type($op_bundle_operands),
+                        $op_bundle_tags)^ )?
+    `:` type($cond) attr-dict
+  }];
+
+  let builders = [
+    OpBuilder<(ins "Value":$cond)>
+  ];
+
+  let hasVerifier = 1;
+}
 
 def LLVM_SSACopyOp : LLVM_OneResultIntrOp<"ssa.copy", [], [0],
                                             [Pure, SameOperandsAndResultType]> {
@@ -992,7 +1017,8 @@ def LLVM_DebugTrap : LLVM_ZeroResultIntrOp<"debugtrap">;
 def LLVM_UBSanTrap : LLVM_ZeroResultIntrOp<"ubsantrap",
   /*overloadedOperands=*/[], /*traits=*/[],
   /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
-  /*immArgPositions=*/[0], /*immArgAttrNames=*/["failureKind"]> {
+  /*requiresOpBundles=*/0, /*immArgPositions=*/[0],
+  /*immArgAttrNames=*/["failureKind"]> {
   let arguments = (ins I8Attr:$failureKind);
 }
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
index c3d352d8d0dd48..a38dafa4d9cf34 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
@@ -291,7 +291,7 @@ class LLVM_IntrOpBase<Dialect dialect, string opName, string enumName,
                       list<int> overloadedResults, list<int> overloadedOperands,
                       list<Trait> traits, int numResults,
                       bit requiresAccessGroup = 0, bit requiresAliasAnalysis = 0,
-                      bit requiresFastmath = 0,
+                      bit requiresFastmath = 0, bit requiresOpBundles = 0,
                       list<int> immArgPositions = [],
                       list<string> immArgAttrNames = []>
     : LLVM_OpBase<dialect, opName, !listconcat(
@@ -313,6 +313,12 @@ class LLVM_IntrOpBase<Dialect dialect, string opName, string enumName,
                  OptionalAttr<LLVM_AliasScopeArrayAttr>:$noalias_scopes,
                  OptionalAttr<LLVM_TBAATagArrayAttr>:$tbaa),
             (ins )));
+  dag opBundleArgs = !if(!gt(requiresOpBundles, 0),
+                         (ins VariadicOfVariadic<LLVM_Type,
+                                "op_bundle_sizes">:$op_bundle_operands,
+                              DenseI32ArrayAttr:$op_bundle_sizes,
+                              OptionalAttr<ArrayAttr>:$op_bundle_tags),
+                         (ins ));
   string llvmEnumName = enumName;
   string overloadedResultsCpp =  "{" # !interleave(overloadedResults, ", ") # "}";
   string overloadedOperandsCpp =  "{" # !interleave(overloadedOperands, ", ") # "}";
@@ -336,6 +342,8 @@ class LLVM_IntrOpBase<Dialect dialect, string opName, string enumName,
     SmallVector<NamedAttribute> mlirAttrs;
     if (failed(moduleImport.convertIntrinsicArguments(
       llvmOperands,
+      llvmOpBundles,
+      }] # !if(!gt(requiresOpBundles, 0), "true", "false") # [{,
       }] # immArgPositionsCpp # [{,
       }] # immArgAttrNamesCpp # [{,
       mlirOperands,
@@ -381,12 +389,14 @@ class LLVM_IntrOp<string mnem, list<int> overloadedResults,
                   list<int> overloadedOperands, list<Trait> traits,
                   int numResults, bit requiresAccessGroup = 0,
                   bit requiresAliasAnalysis = 0, bit requiresFastmath = 0,
+                  bit requiresOpBundles = 0,
                   list<int> immArgPositions = [],
                   list<string> immArgAttrNames = []>
     : LLVM_IntrOpBase<LLVM_Dialect, "intr." # mnem, !subst(".", "_", mnem),
                       overloadedResults, overloadedOperands, traits,
                       numResults, requiresAccessGroup, requiresAliasAnalysis,
-                      requiresFastmath, immArgPositions, immArgAttrNames>;
+                      requiresFastmath, requiresOpBundles, immArgPositions,
+                      immArgAttrNames>;
 
 // Base class for LLVM intrinsic operations returning no results. Places the
 // intrinsic into the LLVM dialect and prefixes its name with "intr.".
@@ -406,11 +416,13 @@ class LLVM_ZeroResultIntrOp<string mnem, list<int> overloadedOperands = [],
                             list<Trait> traits = [],
                             bit requiresAccessGroup = 0,
                             bit requiresAliasAnalysis = 0,
+                            bit requiresOpBundles = 0,
                             list<int> immArgPositions = [],
                             list<string> immArgAttrNames = []>
     : LLVM_IntrOp<mnem, [], overloadedOperands, traits, /*numResults=*/0,
                   requiresAccessGroup, requiresAliasAnalysis,
-                  /*requiresFastMath=*/0, immArgPositions, immArgAttrNames>;
+                  /*requiresFastMath=*/0, requiresOpBundles, immArgPositions,
+                  immArgAttrNames>;
 
 // Base class for LLVM intrinsic operations returning one result. Places the
 // intrinsic into the LLVM dialect and prefixes its name with "intr.". This is
@@ -422,11 +434,12 @@ class LLVM_OneResultIntrOp<string mnem, list<int> overloadedResults = [],
                            list<int> overloadedOperands = [],
                            list<Trait> traits = [],
                            bit requiresFastmath = 0,
-                          list<int> immArgPositions = [],
-                          list<string> immArgAttrNames = []>
+                           list<int> immArgPositions = [],
+                           list<string> immArgAttrNames = []>
     : LLVM_IntrOp<mnem, overloadedResults, overloadedOperands, traits, 1,
                   /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
-                  requiresFastmath, immArgPositions, immArgAttrNames>;
+                  requiresFastmath, /*requiresOpBundles=*/0, immArgPositions,
+                  immArgAttrNames>;
 
 def LLVM_OneResultOpBuilder :
   OpBuilder<(ins "Type":$resultType, "ValueRange":$operands,
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index bbca7bc7286acb..d5def510a904d3 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -559,11 +559,7 @@ def LLVM_InvokeOp : LLVM_Op<"invoke", [
                    VariadicOfVariadic<LLVM_Type,
                                       "op_bundle_sizes">:$op_bundle_operands,
                    DenseI32ArrayAttr:$op_bundle_sizes,
-                   DefaultValuedProperty<
-                     ArrayProperty<StringProperty, "operand bundle tags">,
-                     "ArrayRef<std::string>{}",
-                     "SmallVector<std::string>{}"
-                   >:$op_bundle_tags);
+                   OptionalAttr<ArrayAttr>:$op_bundle_tags);
   let results = (outs Optional<LLVM_Type>:$result);
   let successors = (successor AnySuccessor:$normalDest,
                               AnySuccessor:$unwindDest);
@@ -678,11 +674,7 @@ def LLVM_CallOp : LLVM_MemAccessOpBase<"call",
                   VariadicOfVariadic<LLVM_Type,
                                      "op_bundle_sizes">:$op_bundle_operands,
                   DenseI32ArrayAttr:$op_bundle_sizes,
-                  DefaultValuedProperty<
-                    ArrayProperty<StringProperty, "operand bundle tags">,
-                    "ArrayRef<std::string>{}",
-                    "SmallVector<std::string>{}"
-                  >:$op_bundle_tags);
+                  OptionalAttr<ArrayAttr>:$op_bundle_tags);
   // Append the aliasing related attributes defined in LLVM_MemAccessOpBase.
   let arguments = !con(args, aliasAttrs);
   let results = (outs Optional<LLVM_Type>:$result);
@@ -1930,11 +1922,7 @@ def LLVM_CallIntrinsicOp
                        VariadicOfVariadic<LLVM_Type,
                                           "op_bundle_sizes">:$op_bundle_operands,
                        DenseI32ArrayAttr:$op_bundle_sizes,
-                       DefaultValuedProperty<
-                         ArrayProperty<StringProperty, "operand bundle tags">,
-                         "ArrayRef<std::string>{}",
-                         "SmallVector<std::string>{}"
-                       >:$op_bundle_tags);
+                       OptionalAttr<ArrayAttr>:$op_bundle_tags);
   let results = (outs Optional<LLVM_Type>:$results);
   let llvmBuilder = [{
     return convertCallLLVMIntrinsicOp(op, builder, moduleTranslation);
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index c40ae4b1016b49..3695708439d91f 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -98,7 +98,7 @@ class ROCDL_IntrOp<string mnemonic, list<int> overloadedResults,
   LLVM_IntrOpBase<ROCDL_Dialect,  mnemonic,
     "amdgcn_" # !subst(".", "_", mnemonic), overloadedResults,
     overloadedOperands, traits, numResults, requiresAccessGroup,
-    requiresAliasAnalysis, 0, immArgPositions, immArgAttrNames>;
+    requiresAliasAnalysis, 0, 0, immArgPositions, immArgAttrNames>;
 
 //===----------------------------------------------------------------------===//
 // ROCDL special register op definitions
diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
index 9f300bcafea537..bbb7af58d27393 100644
--- a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
+++ b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
@@ -243,6 +243,8 @@ class ModuleImport {
   /// corresponding MLIR attribute names.
   LogicalResult
   convertIntrinsicArguments(ArrayRef<llvm::Value *> values,
+                            ArrayRef<llvm::OperandBundleUse> opBundles,
+                            bool requiresOpBundles,
                             ArrayRef<unsigned> immArgPositions,
                             ArrayRef<StringLiteral> immArgAttrNames,
                             SmallVectorImpl<Value> &valuesOut,
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 12ed8cc88ae7b7..cc73878a64ff67 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -241,13 +241,18 @@ static void printOneOpBundle(OpAsmPrinter &p, OperandRange operands,
 static void printOpBundles(OpAsmPrinter &p, Operation *op,
                            OperandRangeRange opBundleOperands,
                            TypeRangeRange opBundleOperandTypes,
-                           ArrayRef<std::string> opBundleTags) {
+                           std::optional<ArrayAttr> opBundleTags) {
+  if (opBundleOperands.empty())
+    return;
+  assert(opBundleTags && "expect operand bundle tags");
+
   p << "[";
   llvm::interleaveComma(
-      llvm::zip(opBundleOperands, opBundleOperandTypes, opBundleTags), p,
+      llvm::zip(opBundleOperands, opBundleOperandTypes, *opBundleTags), p,
       [&p](auto bundle) {
+        auto bundleTag = cast<StringAttr>(std::get<2>(bundle)).getValue();
         printOneOpBundle(p, std::get<0>(bundle), std::get<1>(bundle),
-                         std::get<2>(bundle));
+                         bundleTag);
       });
   p << "]";
 }
@@ -256,7 +261,7 @@ static ParseResult parseOneOpBundle(
     OpAsmParser &p,
     SmallVector<SmallVector<OpAsmParser::UnresolvedOperand>> &opBundleOperands,
     SmallVector<SmallVector<Type>> &opBundleOperandTypes,
-    SmallVector<std::string> &opBundleTags) {
+    SmallVector<Attribute> &opBundleTags) {
   SMLoc currentParserLoc = p.getCurrentLocation();
   SmallVector<OpAsmParser::UnresolvedOperand> operands;
   SmallVector<Type> types;
@@ -276,7 +281,7 @@ static ParseResult parseOneOpBundle(
 
   opBundleOperands.push_back(std::move(operands));
   opBundleOperandTypes.push_back(std::move(types));
-  opBundleTags.push_back(std::move(tag));
+  opBundleTags.push_back(StringAttr::get(p.getContext(), tag));
 
   return success();
 }
@@ -285,16 +290,17 @@ static std::optional<ParseResult> parseOpBundles(
     OpAsmParser &p,
     SmallVector<SmallVector<OpAsmParser::UnresolvedOperand>> &opBundleOperands,
     SmallVector<SmallVector<Type>> &opBundleOperandTypes,
-    SmallVector<std::string> &opBundleTags) {
+    ArrayAttr &opBundleTags) {
   if (p.parseOptionalLSquare())
     return std::nullopt;
 
   if (succeeded(p.parseOptionalRSquare()))
     return success();
 
+  SmallVector<Attribute> opBundleTagAttrs;
   auto bundleParser = [&] {
     return parseOneOpBundle(p, opBundleOperands, opBundleOperandTypes,
-                            opBundleTags);
+                            opBundleTagAttrs);
   };
   if (p.parseCommaSeparatedList(bundleParser))
     return failure();
@@ -302,6 +308,8 @@ static std::optional<ParseResult> parseOpBundles(
   if (p.parseRSquare())
     return failure();
 
+  opBundleTags = ArrayAttr::get(p.getContext(), opBundleTagAttrs);
+
   return success();
 }
 
@@ -1039,7 +1047,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results,
         /*CConv=*/nullptr, /*TailCallKind=*/nullptr,
         /*memory_effects=*/nullptr,
         /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
-        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt,
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
         /*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
         /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -1066,7 +1074,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state,
         /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
         /*convergent=*/nullptr,
         /*no_unwind=*/nullptr, /*will_return=*/nullptr,
-        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt,
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
         /*access_groups=*/nullptr,
         /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -1079,7 +1087,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state,
         /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr,
         /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
         /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
-        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt,
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
         /*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
         /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -1092,7 +1100,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func,
         /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr,
         /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
         /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
-        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt,
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
         /*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
         /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -1192,12 +1200,20 @@ LogicalResult verifyCallOpVarCalleeType(OpTy callOp) {
 template <typename OpType>
 static LogicalResult verifyOperandBundles(OpType &op) {
   OperandRangeRange opBundleOperands = op.getOpBundleOperands();
-  ArrayRef<std::string> opBundleTags = op.getOpBundleTags();
+  std::optional<ArrayAttr> opBundleTags = op.getOpBundleTags();
 
-  if (opBundleTags.size() != opBundleOperands.size())
+  auto isStringAttr = [](Attribute tagAttr) {
+    return isa<StringAttr>(tagAttr);
+  };
+  if (opBundleTags && !llvm::all_of(*opBundleTags, isStringAttr))
+    return op.emitError("operand bundle tag must be a StringAttr");
+
+  size_t numOpBundles = opBundleOperands.size();
+  size_t numOpBundleTags = opBundleTags ? opBundleTags->size() : 0;
+  if (numOpBundles != numOpBundleTags)
     return op.emitError("expected ")
-           << opBundleOperands.size()
-           << " operand bundle tags, but actually got " << opBundleTags.size();
+           << numOpBundles << " operand bundle tags, but actually got "
+           << numOpBundleTags;
 
   return success();
 }
@@ -1329,7 +1345,8 @@ void CallOp::print(OpAsmPrinter &p) {
                           {getCalleeAttrName(), getTailCallKindAttrName(),
                            getVarCalleeTypeAttrName(), getCConvAttrName(),
                            getOperandSegmentSizesAttrName(),
-                           getOpBundleSizesAttrName()});
+                           getOpBundleSizesAttrName(),
+                           getOpBundleTagsAttrName()});
 
   p << " : ";
   if (!isDirect)
@@ -1437,7 +1454,7 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) {
   SmallVector<OpAsmParser::UnresolvedOperand> operands;
   SmallVector<SmallVector<OpAsmParser::UnresolvedOperand>> opBundleOperands;
   SmallVector<SmallVector<Type>> opBundleOperandTypes;
-  SmallVector<std::string> opBundleTags;
+  ArrayAttr opBundleTags;
 
   // Default to C Calling Convention if no keyword is provided.
   result.addAttribute(
@@ -1483,9 +1500,9 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) {
           parser, opBundleOperands, opBundleOperandTypes, opBundleTags);
       result && failed(*result))
     return failure();
-  if (!opBundleTags.empty())
-    result.getOrAddProperties<CallOp::Properties>().op_bundle_tags =
-        std::move(opBundleTags);
+  if (opBundleTags && !opBundleTags.empty())
+    result.addAttribute(CallOp::getOpBundleTagsAttrName(result.name).getValue(),
+                        opBundleTags);
 
   if (parser.parseOptionalAttrDict(result.attributes))
     return failure();
@@ -1525,8 +1542,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func,
   auto calleeType = func.getFunctionType();
   build(builder, state, getCallOpResultTypes(calleeType),
         getCallOpVarCalleeType(calleeType), SymbolRefAttr::get(func), ops,
-        normalOps, unwindOps, nullptr, nullptr, {}, std::nullopt, normal,
-        unwind);
+        normalOps, unwindOps, nullptr, nullptr, {}, {}, normal, unwind);
 }
 
 void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys,
@@ -1535,7 +1551,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys,
                      ValueRange unwindOps) {
   build(builder, state, tys,
         /*var_callee_type=*/nullptr, callee, ops, normalOps, unwindOps, nullptr,
-        nullptr, {}, std::nullopt, normal, unwind);
+        nullptr, {}, {}, normal, unwind);
 }
 
 void InvokeOp::build(OpBuilder &builder, OperationState &state,
@@ -1544,7 +1560,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state,
                      Block *unwind, ValueRange unwindOps) {
   build(builder, state, getCallOpResultTypes(calleeType),
         getCallOpVarCalleeType(calleeType), callee, ops, normalOps, unwindOps,
-        nullptr, nullptr, {}, std::nullopt, normal, unwind);
+        nullptr, nullptr, {}, {}, normal, unwind);
 }
 
 SuccessorOperands InvokeOp::getSuccessorOperands(unsigned index) {
@@ -1634,7 +1650,8 @@ void InvokeOp::print(OpAsmPrinter &p) {
   p.printOptionalAttrDict((*this)->getAttrs(),
                           {getCalleeAttrName(), getOperandSegmentSizeAttr(),
                            getCConvAttrName(), getVarCalleeTypeAttrName(),
-                           getOpBundleSizesAttrName()});
+                           getOpBundleSizesAttrName(),
+                           getOpBundleTagsAttrName()});
 
   p << " : ";
   if (!isDirect)
@@ -1657,7 +1674,7 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) {
   TypeAttr varCalleeType;
   SmallVector<SmallVector<OpAsmParser::UnresolvedOperand>> opBundleOperands;
   SmallVector<SmallVector<Type>> opBundleOperandTypes;
-  SmallVector<std::string> opBundleTags;
+  ArrayAttr opBundleTags;
   Block *normalDest, *unwindDest;
   SmallVector<Value, 4> normalOperands, unwindOperands;
   Builder &builder = parser.getBuilder();
@@ -1703,9 +1720,10 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) {
           parser, opBundleOperands, opBundleOperandTypes, opBundleTags);
       result && failed(*result))
     return failure();
-  if (!opBundleTags.empty())
-    result.getOrAddProperties<InvokeOp::Properties>().op_bundle_tags =
-        std::move(opBundleTags);
+  if (opBundleTags && !opBundleTags.empty())
+    result.addAttribute(
+        InvokeOp::getOpBundleTagsAttrName(result.name).getValue(),
+        opBundleTags);
 
   if (parser.parseOptionalAttrDict(result.attributes))
     return failure();
@@ -3333,7 +3351,7 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
                             mlir::StringAttr intrin, mlir::ValueRange args) {
   build(builder, state, /*resultTypes=*/TypeRange{}, intrin, args,
         FastmathFlagsAttr{},
-        /*op_bundle_operands=*/{});
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{});
 }
 
 void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
@@ -3341,14 +3359,14 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
                             mlir::LLVM::FastmathFlagsAttr fastMathFlags) {
   build(builder, state, /*resultTypes=*/TypeRange{}, intrin, args,
         fastMathFlags,
-        /*op_bundle_operands=*/{});
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{});
 }
 
 void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
                             mlir::Type resultType, mlir::StringAttr intrin,
                             mlir::ValueRange args) {
   build(builder, state, {resultType}, intrin, args, FastmathFlagsAttr{},
-        /*op_bundle_operands=*/{});
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{});
 }
 
 void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
@@ -3356,7 +3374,7 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
                             mlir::StringAttr intrin, mlir::ValueRange args,
                             mlir::LLVM::FastmathFlagsAttr fastMathFlags) {
   build(builder, state, resultTypes, intrin, args, fastMathFlags,
-        /*op_bundle_operands=*/{});
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{});
 }
 
 //===----------------------------------------------------------------------===//
@@ -3413,6 +3431,18 @@ void InlineAsmOp::getEffects(
   }
 }
 
+//===----------------------------------------------------------------------===//
+// AssumeOp (intrinsic)
+//===----------------------------------------------------------------------===//
+
+void LLVM::AssumeOp::build(OpBuilder &builder, OperationState &state,
+                           mlir::Value cond) {
+  return build(builder, state, cond, /*op_bundle_operands=*/{},
+               /*op_bundle_tags=*/{});
+}
+
+LogicalResult LLVM::AssumeOp::verify() { return verifyOperandBundles(*this); }
+
 //===----------------------------------------------------------------------===//
 // masked_gather (intrinsic)
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
index d034e576dfc579..4fd043c7c93e68 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
@@ -68,6 +68,12 @@ static LogicalResult convertIntrinsicImpl(OpBuilder &odsBuilder,
   if (isConvertibleIntrinsic(intrinsicID)) {
     SmallVector<llvm::Value *> args(inst->args());
     ArrayRef<llvm::Value *> llvmOperands(args);
+
+    SmallVector<llvm::OperandBundleUse> llvmOpBundles;
+    llvmOpBundles.reserve(inst->getNumOperandBundles());
+    for (unsigned i = 0; i < inst->getNumOperandBundles(); ++i)
+      llvmOpBundles.push_back(inst->getOperandBundleAt(i));
+
 #include "mlir/Dialect/LLVMIR/LLVMIntrinsicFromLLVMIRConversions.inc"
   }
 
diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
index a8595d14ccf2e5..2084e527773ca8 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
@@ -114,17 +114,27 @@ convertOperandBundle(OperandRange bundleOperands, StringRef bundleTag,
 }
 
 static SmallVector<llvm::OperandBundleDef>
-convertOperandBundles(OperandRangeRange bundleOperands,
-                      ArrayRef<std::string> bundleTags,
+convertOperandBundles(OperandRangeRange bundleOperands, ArrayAttr bundleTags,
                       LLVM::ModuleTranslation &moduleTranslation) {
   SmallVector<llvm::OperandBundleDef> bundles;
   bundles.reserve(bundleOperands.size());
 
-  for (auto [operands, tag] : llvm::zip_equal(bundleOperands, bundleTags))
+  for (auto [operands, tagAttr] : llvm::zip_equal(bundleOperands, bundleTags)) {
+    StringRef tag = cast<StringAttr>(tagAttr).getValue();
     bundles.push_back(convertOperandBundle(operands, tag, moduleTranslation));
+  }
   return bundles;
 }
 
+static SmallVector<llvm::OperandBundleDef>
+convertOperandBundles(OperandRangeRange bundleOperands,
+                      std::optional<ArrayAttr> bundleTags,
+                      LLVM::ModuleTranslation &moduleTranslation) {
+  if (!bundleTags)
+    return {};
+  return convertOperandBundles(bundleOperands, *bundleTags, moduleTranslation);
+}
+
 /// Builder for LLVM_CallIntrinsicOp
 static LogicalResult
 convertCallLLVMIntrinsicOp(CallIntrinsicOp op, llvm::IRBuilderBase &builder,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp
index bc830a77f3c580..2c0b665ad0d833 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp
@@ -50,6 +50,12 @@ static LogicalResult convertIntrinsicImpl(OpBuilder &odsBuilder,
   if (isConvertibleIntrinsic(intrinsicID)) {
     SmallVector<llvm::Value *> args(inst->args());
     ArrayRef<llvm::Value *> llvmOperands(args);
+
+    SmallVector<llvm::OperandBundleUse> llvmOpBundles;
+    llvmOpBundles.reserve(inst->getNumOperandBundles());
+    for (unsigned i = 0; i < inst->getNumOperandBundles(); ++i)
+      llvmOpBundles.push_back(inst->getOperandBundleAt(i));
+
 #include "mlir/Dialect/LLVMIR/NVVMFromLLVMIRConversions.inc"
   }
 
diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
index bd861f3a69e53c..6e97b2a50af8a1 100644
--- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
@@ -1311,7 +1311,8 @@ ModuleImport::convertValues(ArrayRef<llvm::Value *> values) {
 }
 
 LogicalResult ModuleImport::convertIntrinsicArguments(
-    ArrayRef<llvm::Value *> values, ArrayRef<unsigned> immArgPositions,
+    ArrayRef<llvm::Value *> values, ArrayRef<llvm::OperandBundleUse> opBundles,
+    bool requiresOpBundles, ArrayRef<unsigned> immArgPositions,
     ArrayRef<StringLiteral> immArgAttrNames, SmallVectorImpl<Value> &valuesOut,
     SmallVectorImpl<NamedAttribute> &attrsOut) {
   assert(immArgPositions.size() == immArgAttrNames.size() &&
@@ -1341,6 +1342,35 @@ LogicalResult ModuleImport::convertIntrinsicArguments(
     valuesOut.push_back(*mlirValue);
   }
 
+  SmallVector<int> opBundleSizes;
+  SmallVector<Attribute> opBundleTagAttrs;
+  if (requiresOpBundles) {
+    opBundleSizes.reserve(opBundles.size());
+    opBundleTagAttrs.reserve(opBundles.size());
+
+    for (const llvm::OperandBundleUse &bundle : opBundles) {
+      opBundleSizes.push_back(bundle.Inputs.size());
+      opBundleTagAttrs.push_back(StringAttr::get(context, bundle.getTagName()));
+
+      for (const llvm::Use &opBundleOperand : bundle.Inputs) {
+        auto operandMlirValue = convertValue(opBundleOperand.get());
+        if (failed(operandMlirValue))
+          return failure();
+        valuesOut.push_back(*operandMlirValue);
+      }
+    }
+
+    auto opBundleSizesAttr = DenseI32ArrayAttr::get(context, opBundleSizes);
+    auto opBundleSizesAttrNameAttr =
+        StringAttr::get(context, LLVMDialect::getOpBundleSizesAttrName());
+    attrsOut.push_back({opBundleSizesAttrNameAttr, opBundleSizesAttr});
+
+    auto opBundleTagsAttr = ArrayAttr::get(context, opBundleTagAttrs);
+    auto opBundleTagsAttrNameAttr =
+        StringAttr::get(context, LLVMDialect::getOpBundleTagsAttrName());
+    attrsOut.push_back({opBundleTagsAttrNameAttr, opBundleTagsAttr});
+  }
+
   return success();
 }
 
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index 6e005f9ec5df85..ceb8ba3b33818b 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -55,6 +55,7 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <numeric>
 #include <optional>
 
 #define DEBUG_TYPE "llvm-dialect-to-llvm-ir"
@@ -854,8 +855,40 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall(
          "LLVM `immArgPositions` and MLIR `immArgAttrNames` should have equal "
          "length");
 
+  SmallVector<llvm::OperandBundleDef> opBundles;
+  size_t numOpBundleOperands = 0;
+  auto opBundleSizesAttr = cast_if_present<DenseI32ArrayAttr>(
+      intrOp->getAttr(LLVMDialect::getOpBundleSizesAttrName()));
+  auto opBundleTagsAttr = cast_if_present<ArrayAttr>(
+      intrOp->getAttr(LLVMDialect::getOpBundleTagsAttrName()));
+
+  if (opBundleSizesAttr && opBundleTagsAttr) {
+    ArrayRef<int> opBundleSizes = opBundleSizesAttr.asArrayRef();
+    assert(opBundleSizes.size() == opBundleTagsAttr.size() &&
+           "operand bundles and tags do not match");
+
+    numOpBundleOperands =
+        std::accumulate(opBundleSizes.begin(), opBundleSizes.end(), size_t(0));
+    assert(numOpBundleOperands <= intrOp->getNumOperands() &&
+           "operand bundle operands is more than the number of operands");
+
+    ValueRange operands = intrOp->getOperands().take_back(numOpBundleOperands);
+    size_t nextOperandIdx = 0;
+    opBundles.reserve(opBundleSizesAttr.size());
+
+    for (auto [opBundleTagAttr, bundleSize] :
+         llvm::zip(opBundleTagsAttr, opBundleSizes)) {
+      auto bundleTag = cast<StringAttr>(opBundleTagAttr).str();
+      auto bundleOperands = moduleTranslation.lookupValues(
+          operands.slice(nextOperandIdx, bundleSize));
+      opBundles.emplace_back(std::move(bundleTag), std::move(bundleOperands));
+      nextOperandIdx += bundleSize;
+    }
+  }
+
   // Map operands and attributes to LLVM values.
-  auto operands = moduleTranslation.lookupValues(intrOp->getOperands());
+  auto opOperands = intrOp->getOperands().drop_back(numOpBundleOperands);
+  auto operands = moduleTranslation.lookupValues(opOperands);
   SmallVector<llvm::Value *> args(immArgPositions.size() + operands.size());
   for (auto [immArgPos, immArgName] :
        llvm::zip(immArgPositions, immArgAttrNames)) {
@@ -890,7 +923,7 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall(
   llvm::Function *llvmIntr = llvm::Intrinsic::getOrInsertDeclaration(
       module, intrinsic, overloadedTypes);
 
-  return builder.CreateCall(llvmIntr, args);
+  return builder.CreateCall(llvmIntr, args, opBundles);
 }
 
 /// Given a single MLIR operation, create the corresponding LLVM IR operation
diff --git a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir
index b86103422b0745..55b1bc9c545a85 100644
--- a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir
+++ b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir
@@ -684,7 +684,7 @@ func.func @collapse_static_shape_with_non_identity_layout(%arg: memref<1x1x8x8xf
 // CHECK: %[[INT_TO_PTR:.*]] = llvm.ptrtoint %[[BUFF_ADDR]] : !llvm.ptr to i64
 // CHECK: %[[AND:.*]] = llvm.and %[[INT_TO_PTR]], {{.*}}  : i64
 // CHECK: %[[CMP:.*]] = llvm.icmp "eq" %[[AND]], {{.*}} : i64
-// CHECK: "llvm.intr.assume"(%[[CMP]]) : (i1) -> ()
+// CHECK: llvm.intr.assume %[[CMP]] : i1
 // CHECK: %[[LD_ADDR:.*]] = llvm.getelementptr %[[BUFF_ADDR]][%{{.*}}] : (!llvm.ptr, i64) -> !llvm.ptr, f32
 // CHECK: %[[VAL:.*]] = llvm.load %[[LD_ADDR]] : !llvm.ptr -> f32
 // CHECK: return %[[VAL]] : f32
diff --git a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
index 9dc22abf143bf0..48dc9079333d4f 100644
--- a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
+++ b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
@@ -160,7 +160,7 @@ func.func @assume_alignment(%0 : memref<4x4xf16>) {
   // CHECK-NEXT: %[[INT:.*]] = llvm.ptrtoint %[[PTR]] : !llvm.ptr to i64
   // CHECK-NEXT: %[[MASKED_PTR:.*]] = llvm.and %[[INT]], %[[MASK:.*]] : i64
   // CHECK-NEXT: %[[CONDITION:.*]] = llvm.icmp "eq" %[[MASKED_PTR]], %[[ZERO]] : i64
-  // CHECK-NEXT: "llvm.intr.assume"(%[[CONDITION]]) : (i1) -> ()
+  // CHECK-NEXT: llvm.intr.assume %[[CONDITION]] : i1
   memref.assume_alignment %0, 16 : memref<4x4xf16>
   return
 }
@@ -177,7 +177,7 @@ func.func @assume_alignment_w_offset(%0 : memref<4x4xf16, strided<[?, ?], offset
   // CHECK-NEXT: %[[INT:.*]] = llvm.ptrtoint %[[BUFF_ADDR]] : !llvm.ptr to i64
   // CHECK-NEXT: %[[MASKED_PTR:.*]] = llvm.and %[[INT]], %[[MASK:.*]] : i64
   // CHECK-NEXT: %[[CONDITION:.*]] = llvm.icmp "eq" %[[MASKED_PTR]], %[[ZERO]] : i64
-  // CHECK-NEXT: "llvm.intr.assume"(%[[CONDITION]]) : (i1) -> ()
+  // CHECK-NEXT: llvm.intr.assume %[[CONDITION]] : i1
   memref.assume_alignment %0, 16 : memref<4x4xf16, strided<[?, ?], offset: ?>>
   return
 }
diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir
index f9551e311df59f..0b7ca3f2bb048a 100644
--- a/mlir/test/Dialect/LLVMIR/inlining.mlir
+++ b/mlir/test/Dialect/LLVMIR/inlining.mlir
@@ -18,7 +18,7 @@ func.func @inner_func_inlinable(%ptr : !llvm.ptr) -> i32 {
   "llvm.intr.memset"(%ptr, %byte, %0) <{isVolatile = true}> : (!llvm.ptr, i8, i32) -> ()
   "llvm.intr.memmove"(%ptr, %ptr, %0) <{isVolatile = true}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
   "llvm.intr.memcpy"(%ptr, %ptr, %0) <{isVolatile = true}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
-  "llvm.intr.assume"(%true) : (i1) -> ()
+  llvm.intr.assume %true : i1
   llvm.fence release
   %2 = llvm.atomicrmw add %ptr, %0 monotonic : !llvm.ptr, i32
   %3 = llvm.cmpxchg %ptr, %0, %1 acq_rel monotonic : !llvm.ptr, i32
@@ -44,7 +44,7 @@ func.func @inner_func_inlinable(%ptr : !llvm.ptr) -> i32 {
 // CHECK: "llvm.intr.memset"(%[[PTR]]
 // CHECK: "llvm.intr.memmove"(%[[PTR]], %[[PTR]]
 // CHECK: "llvm.intr.memcpy"(%[[PTR]], %[[PTR]]
-// CHECK: "llvm.intr.assume"
+// CHECK: llvm.intr.assume
 // CHECK: llvm.fence release
 // CHECK: llvm.atomicrmw add %[[PTR]], %[[CST]] monotonic
 // CHECK: llvm.cmpxchg %[[PTR]], %[[CST]], %[[RES]] acq_rel monotonic
diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
index 3062cdc38c0abb..b8ce7db795a1d1 100644
--- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir
+++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
@@ -836,3 +836,30 @@ llvm.func @test_call_intrin_with_opbundle(%arg0 : !llvm.ptr) {
   llvm.call_intrinsic "llvm.assume"(%0) ["align"(%arg0, %1 : !llvm.ptr, i32)] : (i1) -> ()
   llvm.return
 }
+
+// CHECK-LABEL: @test_assume_intr_no_opbundle
+llvm.func @test_assume_intr_no_opbundle(%arg0 : !llvm.ptr) {
+  %0 = llvm.mlir.constant(1 : i1) : i1
+  // CHECK: llvm.intr.assume %0 : i1
+  llvm.intr.assume %0 : i1
+  llvm.return
+}
+
+// CHECK-LABEL: @test_assume_intr_empty_opbundle
+llvm.func @test_assume_intr_empty_opbundle(%arg0 : !llvm.ptr) {
+  %0 = llvm.mlir.constant(1 : i1) : i1
+  // CHECK: llvm.intr.assume %0 : i1
+  llvm.intr.assume %0 [] : i1
+  llvm.return
+}
+
+// CHECK-LABEL: @test_assume_intr_with_opbundles
+llvm.func @test_assume_intr_with_opbundles(%arg0 : !llvm.ptr) {
+  %0 = llvm.mlir.constant(1 : i1) : i1
+  %1 = llvm.mlir.constant(2 : i32) : i32
+  %2 = llvm.mlir.constant(3 : i32) : i32
+  %3 = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: llvm.intr.assume %0 ["tag1"(%1, %2 : i32, i32), "tag2"(%3 : i32)] : i1
+  llvm.intr.assume %0 ["tag1"(%1, %2 : i32, i32), "tag2"(%3 : i32)] : i1
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
index 28a1bd21c82a38..606b11175f572f 100644
--- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll
+++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
@@ -630,11 +630,21 @@ define void @va_intrinsics_test(ptr %0, ptr %1, ...) {
 ; CHECK-LABEL: @assume
 ; CHECK-SAME:  %[[TRUE:[a-zA-Z0-9]+]]
 define void @assume(i1 %true) {
-  ; CHECK:  "llvm.intr.assume"(%[[TRUE]]) : (i1) -> ()
+  ; CHECK:  llvm.intr.assume %[[TRUE]] : i1
   call void @llvm.assume(i1 %true)
   ret void
 }
 
+; CHECK-LABEL: @assume_with_opbundles
+; CHECK-SAME:  %[[TRUE:[a-zA-Z0-9]+]]
+; CHECK-SAME:  %[[PTR:[a-zA-Z0-9]+]]
+define void @assume_with_opbundles(i1 %true, ptr %p) {
+  ; CHECK: %[[ALIGN:.+]] = llvm.mlir.constant(8 : i32) : i32
+  ; CHECK:  llvm.intr.assume %[[TRUE]] ["align"(%[[PTR]], %[[ALIGN]] : !llvm.ptr, i32)] : i1
+  call void @llvm.assume(i1 %true) ["align"(ptr %p, i32 8)]
+  ret void
+}
+
 ; CHECK-LABEL: @is_constant
 ; CHECK-SAME:  %[[VAL:[a-zA-Z0-9]+]]
 define void @is_constant(i32 %0) {
diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
index 0634a7ba907f1e..cb712eb4e1262d 100644
--- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
@@ -363,6 +363,21 @@ llvm.func @umin_test(%arg0: i32, %arg1: i32, %arg2: vector<8xi32>, %arg3: vector
   llvm.return
 }
 
+// CHECK-LABEL: @assume_without_opbundles
+llvm.func @assume_without_opbundles(%cond: i1) {
+  // CHECK: call void @llvm.assume(i1 %{{.+}})
+  llvm.intr.assume %cond : i1
+  llvm.return
+}
+
+// CHECK-LABEL: @assume_with_opbundles
+llvm.func @assume_with_opbundles(%cond: i1, %p: !llvm.ptr) {
+  %0 = llvm.mlir.constant(8 : i32) : i32
+  // CHECK: call void @llvm.assume(i1 %{{.+}}) [ "align"(ptr %{{.+}}, i32 8) ]
+  llvm.intr.assume %cond ["align"(%p, %0 : !llvm.ptr, i32)] : i1
+  llvm.return
+}
+
 // CHECK-LABEL: @vector_reductions
 llvm.func @vector_reductions(%arg0: f32, %arg1: vector<8xf32>, %arg2: vector<8xi32>) {
   // CHECK: call i32 @llvm.vector.reduce.add.v8i32
diff --git a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
index af0981440a1776..15658ea6068121 100644
--- a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
@@ -188,7 +188,7 @@ llvm.func @sadd_overflow_intr_wrong_type(%arg0 : i32, %arg1 : f32) -> !llvm.stru
 
 llvm.func @assume_intr_wrong_type(%cond : i16) {
   // expected-error @below{{op operand #0 must be 1-bit signless integer, but got 'i16'}}
-  "llvm.intr.assume"(%cond) : (i16) -> ()
+  llvm.intr.assume %cond : i16
   llvm.return
 }
 

>From df0551298868b164197a4e54e9444120dc96ff53 Mon Sep 17 00:00:00 2001
From: VladiKrapp-Arm <vladi.krapp at arm.com>
Date: Wed, 16 Oct 2024 13:50:19 +0100
Subject: [PATCH 56/82] [ARM] Add test for thumb2-reduce-size NFC (#112333)

Check that t2MUL is reduced to tMUL
---
 llvm/test/CodeGen/Thumb2/avoidmuls.mir | 67 ++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 llvm/test/CodeGen/Thumb2/avoidmuls.mir

diff --git a/llvm/test/CodeGen/Thumb2/avoidmuls.mir b/llvm/test/CodeGen/Thumb2/avoidmuls.mir
new file mode 100644
index 00000000000000..8d5567482d5cd7
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/avoidmuls.mir
@@ -0,0 +1,67 @@
+# RUN: llc -run-pass=thumb2-reduce-size %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "thumbv8m.main-arm-none-eabi"
+
+  ; Function Attrs: norecurse nounwind readnone
+  define i32 @test(i32 %x, i32 %y) local_unnamed_addr #0 {
+  entry:
+    %cmp6 = icmp sgt i32 %y, 0
+    br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:                               ; preds = %entry
+    br label %for.body
+
+  for.cond.cleanup:                                 ; preds = %for.body, %entry
+    %sum.0.lcssa = phi i32 [ 1, %entry ], [ %mul, %for.body ]
+    ret i32 %sum.0.lcssa
+
+  for.body:                                         ; preds = %for.body, %for.body.preheader
+    %lsr.iv1 = phi i32 [ %lsr.iv.next2, %for.body ], [ %x, %for.body.preheader ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ %y, %for.body.preheader ]
+    %sum.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ]
+    %mul = mul nsw i32 %lsr.iv1, %sum.07
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %lsr.iv.next2 = add i32 %lsr.iv1, 1
+    %exitcond = icmp eq i32 %lsr.iv.next, 0
+    br i1 %exitcond, label %for.cond.cleanup, label %for.body
+  }
+
+  attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m33" "target-features"="-d32,+dsp,+fp-armv8,-fp64,+hwdiv,+strict-align,+thumb-mode,-crc,-dotprod,-hwdiv-arm,-ras" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+...
+---
+name:            test
+tracksRegLiveness: true
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+body:             |
+  bb.0.entry:
+    successors: %bb.1.for.body, %bb.2.for.cond.cleanup
+    liveins: $r0, $r1
+
+    $r2 = tMOVr $r0, 14, _
+    $r0 = t2MOVi 1, 14, _, _
+    t2CMPri $r1, 1, 14, _, implicit-def $cpsr
+    t2Bcc %bb.2.for.cond.cleanup, 11, killed $cpsr
+
+  bb.1.for.body:
+    successors: %bb.2.for.cond.cleanup, %bb.1.for.body
+    liveins: $r0, $r1, $r2
+
+    $r0 = t2MUL $r2, killed $r0, 14, _
+    $r2 = t2ADDri killed $r2, 1, 14, _, _
+    $r1 = t2SUBri killed $r1, 1, 14, _, def $cpsr
+    t2Bcc %bb.1.for.body, 1, killed $cpsr
+
+  bb.2.for.cond.cleanup:
+    liveins: $r0
+
+    tBX_RET 14, _, implicit $r0
+
+...
+# CHECK-LABEL: test
+# CHECK: tMUL
+# CHECK-NOT: t2MUL

>From bac436bd56ebce290279aa15a40e7c99db3c3591 Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis at chromium.org>
Date: Wed, 16 Oct 2024 09:23:35 -0400
Subject: [PATCH 57/82] [gn] port 5f2cf99e146c (lldb darwin loader tblgen)

---
 .../Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn     | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn
index 03e82576d67314..f9249c208d99b7 100644
--- a/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn
@@ -1,3 +1,14 @@
+import("//lldb/utils/TableGen/lldb_tablegen.gni")
+
+lldb_tablegen("DynamicLoaderDarwinProperties") {
+  args = [ "-gen-lldb-property-defs" ]
+}
+
+lldb_tablegen("DynamicLoaderDarwinPropertiesEnum") {
+  args = [ "-gen-lldb-property-enum-defs" ]
+  td_file = "DynamicLoaderDarwinProperties.td"
+}
+
 static_library("MacOSX-DYLD") {
   output_name = "lldbPluginDynamicLoaderMacOSXDYLD"
   configs += [
@@ -5,6 +16,8 @@ static_library("MacOSX-DYLD") {
     "//llvm/utils/gn/build:lldb_code",
   ]
   deps = [
+    ":DynamicLoaderDarwinProperties",
+    ":DynamicLoaderDarwinPropertiesEnum",
     "//lldb/source/Breakpoint",
     "//lldb/source/Core",
     "//lldb/source/Expression",
@@ -21,6 +34,7 @@ static_library("MacOSX-DYLD") {
   include_dirs = [ "//lldb/source" ]
   sources = [
     "DynamicLoaderDarwin.cpp",
+    "DynamicLoaderDarwinProperties.cpp",
     "DynamicLoaderMacOS.cpp",
     "DynamicLoaderMacOSXDYLD.cpp",
   ]

>From fa5d3f6eeb7ca1c58e387ca5513d0255e4874e96 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot at gmail.com>
Date: Wed, 16 Oct 2024 13:24:03 +0000
Subject: [PATCH 58/82] [gn build] Port 2e8ad49e7cff

---
 llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn
index 5146c9a0f61ddd..85dfd7738c1703 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn
@@ -16,7 +16,9 @@ static_library("Vectorize") {
     "SandboxVectorizer/DependencyGraph.cpp",
     "SandboxVectorizer/Interval.cpp",
     "SandboxVectorizer/Passes/BottomUpVec.cpp",
+    "SandboxVectorizer/Passes/RegionsFromMetadata.cpp",
     "SandboxVectorizer/SandboxVectorizer.cpp",
+    "SandboxVectorizer/SandboxVectorizerPassBuilder.cpp",
     "SandboxVectorizer/SeedCollector.cpp",
     "VPlan.cpp",
     "VPlanAnalysis.cpp",

>From 8ce0fb86d3acc066a16637ea5c5691da984707a7 Mon Sep 17 00:00:00 2001
From: "A. Jiang" <de34 at live.cn>
Date: Wed, 16 Oct 2024 21:32:02 +0800
Subject: [PATCH 59/82] [libc++][NFC] Reduce use of `__add_lvalue_reference_t`
 (#112497)

Currently, the occurrences of `__add_lvalue_reference_t` in
`__split_buffer` and `__assoc_state` are redundant.

* In `__split_buffer`, the `__alloc_ref` and `__alloc_const_ref` member
  typedefs are no longer used.
* In `__assoc_state`, we should simply use `_Rp&`, which must be
  well-formed since it's already required that `sizeof(_Rp)` is
  well-formed.

This PR removes the redundant uses. The remaining occurrences in
`shared_ptr`, `unique_ptr`, and several type traits are still needed.
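
A minimal standalone sketch of the reasoning, using the public
`std::add_lvalue_reference_t` rather than the libc++-internal alias (the
names below are illustrative, not part of the patch): once `sizeof(R)` is
required to be valid, `R` is a complete object type, so `R&` is well-formed
and names exactly the type the trait would produce.

  #include <type_traits>

  template <class R>
  struct assoc_state_like {               // hypothetical stand-in for __assoc_state
    static_assert(sizeof(R) > 0);         // completeness is already required
    R& copy();                            // what the patch spells
    std::add_lvalue_reference_t<R> old_copy();  // equivalent, just wordier
  };

  static_assert(std::is_same_v<int&, std::add_lvalue_reference_t<int>>);

The trait only differs from a plain `T&` for non-referenceable types such as
`void` or cv/ref-qualified function types, which cannot occur here.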
---
 libcxx/include/__split_buffer | 3 ---
 libcxx/include/future         | 4 ++--
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/libcxx/include/__split_buffer b/libcxx/include/__split_buffer
index dfe552fbb45893..c4817601039f3b 100644
--- a/libcxx/include/__split_buffer
+++ b/libcxx/include/__split_buffer
@@ -80,9 +80,6 @@ public:
   pointer __end_;
   _LIBCPP_COMPRESSED_PAIR(pointer, __end_cap_, allocator_type, __alloc_);
 
-  using __alloc_ref       = __add_lvalue_reference_t<allocator_type>;
-  using __alloc_const_ref = __add_lvalue_reference_t<allocator_type>;
-
   __split_buffer(const __split_buffer&)            = delete;
   __split_buffer& operator=(const __split_buffer&) = delete;
 
diff --git a/libcxx/include/future b/libcxx/include/future
index dfa373d6593c79..f16f4234c48966 100644
--- a/libcxx/include/future
+++ b/libcxx/include/future
@@ -594,7 +594,7 @@ public:
   _LIBCPP_HIDE_FROM_ABI void set_value_at_thread_exit(_Arg&& __arg);
 
   _LIBCPP_HIDE_FROM_ABI _Rp move();
-  _LIBCPP_HIDE_FROM_ABI __add_lvalue_reference_t<_Rp> copy();
+  _LIBCPP_HIDE_FROM_ABI _Rp& copy();
 };
 
 template <class _Rp>
@@ -636,7 +636,7 @@ _Rp __assoc_state<_Rp>::move() {
 }
 
 template <class _Rp>
-__add_lvalue_reference_t<_Rp> __assoc_state<_Rp>::copy() {
+_Rp& __assoc_state<_Rp>::copy() {
   unique_lock<mutex> __lk(this->__mut_);
   this->__sub_wait(__lk);
   if (this->__exception_ != nullptr)

>From e839d2a60ac3149f09b3cda0816cf5074075733c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 16 Oct 2024 14:27:38 +0100
Subject: [PATCH 60/82] [X86] andnot-patterns.ll - tweak #112425 test patterns
 to use separate source values for ANDNOT operands
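
The folds these tests exercise rewrite (and X, (rot (not Y), Z)) as
(and X, (not (rot Y, Z))); with X and Y taken from the same source value the
pattern degenerates, so the tests now pass Y and the rotate amount as
separate arguments. A hedged C++ sketch of the underlying identity, purely
illustrative and not part of the patch:

  #include <bit>
  #include <cassert>
  #include <cstdint>

  int main() {
    std::uint32_t x = 0xDEADBEEFu, y = 0x12345678u;  // distinct sources
    std::uint32_t ny = ~y;                           // (not Y)
    for (int s = 0; s < 32; ++s) {
      std::uint32_t lhs = x & std::rotl(ny, s);      // and X, (rotl (not Y), Z)
      std::uint32_t rhs = x & ~std::rotl(y, s);      // and X, (not (rotl Y, Z))
      assert(lhs == rhs);
    }
    return 0;
  }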

---
 llvm/test/CodeGen/X86/andnot-patterns.ll | 488 +++++++++++------------
 1 file changed, 231 insertions(+), 257 deletions(-)

diff --git a/llvm/test/CodeGen/X86/andnot-patterns.ll b/llvm/test/CodeGen/X86/andnot-patterns.ll
index 0ff4e3b47ae46a..46ebe6ba76567a 100644
--- a/llvm/test/CodeGen/X86/andnot-patterns.ll
+++ b/llvm/test/CodeGen/X86/andnot-patterns.ll
@@ -10,18 +10,14 @@ declare void @use_i64(i64)
 ; Fold (and X, (rotl (not Y), Z)) -> (and X, (not (rotl Y, Z)))
 ;
 
-define i64 @andnot_rotl_i64(i64 %a0, i64 %a1) nounwind {
+define i64 @andnot_rotl_i64(i64 %a0, i64 %a1, i64 %a2) nounwind {
 ; X86-LABEL: andnot_rotl_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    notl %ebx
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    notl %esi
 ; X86-NEXT:    notl %edx
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    jne .LBB0_1
@@ -29,116 +25,112 @@ define i64 @andnot_rotl_i64(i64 %a0, i64 %a1) nounwind {
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    jmp .LBB0_3
 ; X86-NEXT:  .LBB0_1:
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:  .LBB0_3:
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shldl %cl, %eax, %edx
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shldl %cl, %ebx, %eax
-; X86-NEXT:    andl %edi, %eax
-; X86-NEXT:    andl %esi, %edx
+; X86-NEXT:    shldl %cl, %esi, %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: andnot_rotl_i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    notq %rax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NEXT:    rolq %cl, %rax
 ; X64-NEXT:    andq %rdi, %rax
 ; X64-NEXT:    retq
-  %not = xor i64 %a0, -1
-  %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a1)
+  %not = xor i64 %a1, -1
+  %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a2)
   %and = and i64 %rot, %a0
   ret i64 %and
 }
 
-define i32 @andnot_rotl_i32(i32 %a0, i32 %a1) nounwind {
+define i32 @andnot_rotl_i32(i32 %a0, i32 %a1, i32 %a2) nounwind {
 ; X86-LABEL: andnot_rotl_i32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    notl %eax
 ; X86-NEXT:    roll %cl, %eax
-; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: andnot_rotl_i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    notl %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    roll %cl, %eax
 ; X64-NEXT:    andl %edi, %eax
 ; X64-NEXT:    retq
-  %not = xor i32 %a0, -1
-  %rot = tail call i32 @llvm.fshl.i32(i32 %not, i32 %not, i32 %a1)
+  %not = xor i32 %a1, -1
+  %rot = tail call i32 @llvm.fshl.i32(i32 %not, i32 %not, i32 %a2)
   %and = and i32 %rot, %a0
   ret i32 %and
 }
 
-define i16 @andnot_rotl_i16(i16 %a0, i16 %a1) nounwind {
+define i16 @andnot_rotl_i16(i16 %a0, i16 %a1, i16 %a2) nounwind {
 ; X86-LABEL: andnot_rotl_i16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    notl %eax
 ; X86-NEXT:    rolw %cl, %ax
-; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    andw {{[0-9]+}}(%esp), %ax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: andnot_rotl_i16:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    notl %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    rolw %cl, %ax
 ; X64-NEXT:    andl %edi, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
-  %not = xor i16 %a0, -1
-  %rot = tail call i16 @llvm.fshl.i16(i16 %not, i16 %not, i16 %a1)
+  %not = xor i16 %a1, -1
+  %rot = tail call i16 @llvm.fshl.i16(i16 %not, i16 %not, i16 %a2)
   %and = and i16 %rot, %a0
   ret i16 %and
 }
 
-define i8 @andnot_rotl_i8(i8 %a0, i8 %a1) nounwind {
+define i8 @andnot_rotl_i8(i8 %a0, i8 %a1, i8 %a2) nounwind {
 ; X86-LABEL: andnot_rotl_i8:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    notb %al
 ; X86-NEXT:    rolb %cl, %al
-; X86-NEXT:    andb %dl, %al
+; X86-NEXT:    andb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: andnot_rotl_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    notb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    rolb %cl, %al
 ; X64-NEXT:    andb %dil, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
-  %not = xor i8 %a0, -1
-  %rot = tail call i8 @llvm.fshl.i8(i8 %not, i8 %not, i8 %a1)
+  %not = xor i8 %a1, -1
+  %rot = tail call i8 @llvm.fshl.i8(i8 %not, i8 %not, i8 %a2)
   %and = and i8 %rot, %a0
   ret i8 %and
 }
 
-define i64 @andnot_rotl_i64_multiuse(i64 %a0, i64 %a1) nounwind {
+define i64 @andnot_rotl_i64_multiuse(i64 %a0, i64 %a1, i64 %a2) nounwind {
 ; X86-LABEL: andnot_rotl_i64_multiuse:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
@@ -146,28 +138,28 @@ define i64 @andnot_rotl_i64_multiuse(i64 %a0, i64 %a1) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    notl %eax
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    notl %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    notl %edx
+; X86-NEXT:    notl %esi
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    jne .LBB4_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    jmp .LBB4_3
 ; X86-NEXT:  .LBB4_1:
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:  .LBB4_3:
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    shldl %cl, %edx, %ebx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    shldl %cl, %eax, %ebx
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shldl %cl, %eax, %edx
-; X86-NEXT:    andl %edx, %esi
+; X86-NEXT:    shldl %cl, %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    andl %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    andl %ebx, %edi
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll use_i64 at PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    movl %esi, %eax
@@ -180,18 +172,19 @@ define i64 @andnot_rotl_i64_multiuse(i64 %a0, i64 %a1) nounwind {
 ; X64-LABEL: andnot_rotl_i64_multiuse:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    movq %rsi, %rcx
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rdi, %rbx
-; X64-NEXT:    notq %rdi
+; X64-NEXT:    notq %rsi
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NEXT:    rolq %cl, %rdi
-; X64-NEXT:    andq %rdi, %rbx
+; X64-NEXT:    rolq %cl, %rsi
+; X64-NEXT:    andq %rsi, %rbx
+; X64-NEXT:    movq %rsi, %rdi
 ; X64-NEXT:    callq use_i64 at PLT
 ; X64-NEXT:    movq %rbx, %rax
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    retq
-  %not = xor i64 %a0, -1
-  %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a1)
+  %not = xor i64 %a1, -1
+  %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a2)
   %and = and i64 %rot, %a0
   call void @use_i64(i64 %rot)
   ret i64 %and
@@ -201,130 +194,122 @@ define i64 @andnot_rotl_i64_multiuse(i64 %a0, i64 %a1) nounwind {
 ; Fold (and X, (rotr (not Y), Z)) -> (and X, (not (rotr Y, Z)))
 ;
 
-define i64 @andnot_rotr_i64(i64 %a0, i64 %a1) nounwind {
+define i64 @andnot_rotr_i64(i64 %a0, i64 %a1, i64 %a2) nounwind {
 ; X86-LABEL: andnot_rotr_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    notl %ebx
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    notl %esi
 ; X86-NEXT:    notl %edx
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    jne .LBB5_1
+; X86-NEXT:    je .LBB5_1
 ; X86-NEXT:  # %bb.2:
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    jmp .LBB5_3
 ; X86-NEXT:  .LBB5_1:
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:  .LBB5_3:
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    shldl %cl, %eax, %edx
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    shrdl %cl, %eax, %edx
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shldl %cl, %ebx, %eax
-; X86-NEXT:    andl %edi, %eax
-; X86-NEXT:    andl %esi, %edx
+; X86-NEXT:    shrdl %cl, %esi, %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: andnot_rotr_i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    notq %rax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NEXT:    rolq %cl, %rax
+; X64-NEXT:    rorq %cl, %rax
 ; X64-NEXT:    andq %rdi, %rax
 ; X64-NEXT:    retq
-  %not = xor i64 %a0, -1
-  %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a1)
+  %not = xor i64 %a1, -1
+  %rot = tail call i64 @llvm.fshr.i64(i64 %not, i64 %not, i64 %a2)
   %and = and i64 %rot, %a0
   ret i64 %and
 }
 
-define i32 @andnot_rotr_i32(i32 %a0, i32 %a1) nounwind {
+define i32 @andnot_rotr_i32(i32 %a0, i32 %a1, i32 %a2) nounwind {
 ; X86-LABEL: andnot_rotr_i32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    notl %eax
-; X86-NEXT:    roll %cl, %eax
-; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    rorl %cl, %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: andnot_rotr_i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    notl %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    roll %cl, %eax
+; X64-NEXT:    rorl %cl, %eax
 ; X64-NEXT:    andl %edi, %eax
 ; X64-NEXT:    retq
-  %not = xor i32 %a0, -1
-  %rot = tail call i32 @llvm.fshl.i32(i32 %not, i32 %not, i32 %a1)
+  %not = xor i32 %a1, -1
+  %rot = tail call i32 @llvm.fshr.i32(i32 %not, i32 %not, i32 %a2)
   %and = and i32 %rot, %a0
   ret i32 %and
 }
 
-define i16 @andnot_rotr_i16(i16 %a0, i16 %a1) nounwind {
+define i16 @andnot_rotr_i16(i16 %a0, i16 %a1, i16 %a2) nounwind {
 ; X86-LABEL: andnot_rotr_i16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    notl %eax
-; X86-NEXT:    rolw %cl, %ax
-; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    rorw %cl, %ax
+; X86-NEXT:    andw {{[0-9]+}}(%esp), %ax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: andnot_rotr_i16:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    notl %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    rolw %cl, %ax
+; X64-NEXT:    rorw %cl, %ax
 ; X64-NEXT:    andl %edi, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
-  %not = xor i16 %a0, -1
-  %rot = tail call i16 @llvm.fshl.i16(i16 %not, i16 %not, i16 %a1)
+  %not = xor i16 %a1, -1
+  %rot = tail call i16 @llvm.fshr.i16(i16 %not, i16 %not, i16 %a2)
   %and = and i16 %rot, %a0
   ret i16 %and
 }
 
-define i8 @andnot_rotr_i8(i8 %a0, i8 %a1) nounwind {
+define i8 @andnot_rotr_i8(i8 %a0, i8 %a1, i8 %a2) nounwind {
 ; X86-LABEL: andnot_rotr_i8:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    notb %al
-; X86-NEXT:    rolb %cl, %al
-; X86-NEXT:    andb %dl, %al
+; X86-NEXT:    rorb %cl, %al
+; X86-NEXT:    andb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: andnot_rotr_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    notb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    rolb %cl, %al
+; X64-NEXT:    rorb %cl, %al
 ; X64-NEXT:    andb %dil, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
-  %not = xor i8 %a0, -1
-  %rot = tail call i8 @llvm.fshl.i8(i8 %not, i8 %not, i8 %a1)
+  %not = xor i8 %a1, -1
+  %rot = tail call i8 @llvm.fshr.i8(i8 %not, i8 %not, i8 %a2)
   %and = and i8 %rot, %a0
   ret i8 %and
 }
@@ -333,76 +318,73 @@ define i8 @andnot_rotr_i8(i8 %a0, i8 %a1) nounwind {
 ; Fold (and X, (bswap (not Y))) -> (and X, (not (bswap Y)))
 ;
 
-define i64 @andnot_bswap_i64(i64 %a0) nounwind {
+define i64 @andnot_bswap_i64(i64 %a0, i64 %a1) nounwind {
 ; X86-LABEL: andnot_bswap_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    notl %eax
 ; X86-NEXT:    notl %edx
 ; X86-NEXT:    bswapl %edx
-; X86-NEXT:    andl %eax, %edx
-; X86-NEXT:    notl %eax
 ; X86-NEXT:    bswapl %eax
-; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: andnot_bswap_i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    notq %rax
 ; X64-NEXT:    bswapq %rax
 ; X64-NEXT:    andq %rdi, %rax
 ; X64-NEXT:    retq
-  %not = xor i64 %a0, -1
+  %not = xor i64 %a1, -1
   %bswap = tail call i64 @llvm.bswap.i64(i64 %not)
   %and = and i64 %bswap, %a0
   ret i64 %and
 }
 
-define i32 @andnot_bswap_i32(i32 %a0) nounwind {
+define i32 @andnot_bswap_i32(i32 %a0, i32 %a1) nounwind {
 ; X86-LABEL: andnot_bswap_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    notl %eax
 ; X86-NEXT:    bswapl %eax
-; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: andnot_bswap_i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    notl %eax
 ; X64-NEXT:    bswapl %eax
 ; X64-NEXT:    andl %edi, %eax
 ; X64-NEXT:    retq
-  %not = xor i32 %a0, -1
+  %not = xor i32 %a1, -1
   %bswap = tail call i32 @llvm.bswap.i32(i32 %not)
   %and = and i32 %bswap, %a0
   ret i32 %and
 }
 
-define i16 @andnot_bswap_i16(i16 %a0) nounwind {
+define i16 @andnot_bswap_i16(i16 %a0, i16 %a1) nounwind {
 ; X86-LABEL: andnot_bswap_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    notl %eax
 ; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    andw {{[0-9]+}}(%esp), %ax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: andnot_bswap_i16:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    notl %eax
 ; X64-NEXT:    rolw $8, %ax
 ; X64-NEXT:    andl %edi, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
-  %not = xor i16 %a0, -1
+  %not = xor i16 %a1, -1
   %bswap = tail call i16 @llvm.bswap.i16(i16 %not)
   %and = and i16 %bswap, %a0
   ret i16 %and
@@ -412,72 +394,68 @@ define i16 @andnot_bswap_i16(i16 %a0) nounwind {
 ; Fold (and X, (bitreverse (not Y))) -> (and X, (not (bitreverse Y)))
 ;
 
-define i64 @andnot_bitreverse_i64(i64 %a0) nounwind {
+define i64 @andnot_bitreverse_i64(i64 %a0, i64 %a1) nounwind {
 ; X86-LABEL: andnot_bitreverse_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    notl %eax
+; X86-NEXT:    notl %ecx
+; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    notl %edx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %esi
-; X86-NEXT:    shrl $4, %edx
 ; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
-; X86-NEXT:    leal (%edx,%esi,4), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; X86-NEXT:    shrl %edx
+; X86-NEXT:    shrl $2, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
-; X86-NEXT:    leal (%edx,%esi,2), %edx
-; X86-NEXT:    andl %eax, %edx
-; X86-NEXT:    notl %eax
+; X86-NEXT:    shrl %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    leal (%ecx,%edx,2), %edx
 ; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
 ; X86-NEXT:    shrl $4, %eax
 ; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
 ; X86-NEXT:    shrl $2, %eax
 ; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%esi,4), %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%esi,2), %eax
-; X86-NEXT:    andl %ecx, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: andnot_bitreverse_i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    notq %rax
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    shrq $4, %rcx
-; X64-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %rdx, %rcx
-; X64-NEXT:    andq %rdx, %rax
-; X64-NEXT:    shlq $4, %rax
-; X64-NEXT:    orq %rcx, %rax
-; X64-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
-; X64-NEXT:    movq %rax, %rdx
-; X64-NEXT:    andq %rcx, %rdx
-; X64-NEXT:    shrq $2, %rax
+; X64-NEXT:    notq %rsi
+; X64-NEXT:    bswapq %rsi
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    shrq $4, %rax
+; X64-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
 ; X64-NEXT:    andq %rcx, %rax
-; X64-NEXT:    leaq (%rax,%rdx,4), %rax
+; X64-NEXT:    andq %rcx, %rsi
+; X64-NEXT:    shlq $4, %rsi
+; X64-NEXT:    orq %rax, %rsi
+; X64-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; X64-NEXT:    movq %rsi, %rcx
+; X64-NEXT:    andq %rax, %rcx
+; X64-NEXT:    shrq $2, %rsi
+; X64-NEXT:    andq %rax, %rsi
+; X64-NEXT:    leaq (%rsi,%rcx,4), %rax
 ; X64-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
 ; X64-NEXT:    movq %rax, %rdx
 ; X64-NEXT:    andq %rcx, %rdx
@@ -486,54 +464,53 @@ define i64 @andnot_bitreverse_i64(i64 %a0) nounwind {
 ; X64-NEXT:    leaq (%rax,%rdx,2), %rax
 ; X64-NEXT:    andq %rdi, %rax
 ; X64-NEXT:    retq
-  %not = xor i64 %a0, -1
+  %not = xor i64 %a1, -1
   %bitrev = tail call i64 @llvm.bitreverse.i64(i64 %not)
   %and = and i64 %bitrev, %a0
   ret i64 %and
 }
 
-define i32 @andnot_bitreverse_i32(i32 %a0) nounwind {
+define i32 @andnot_bitreverse_i32(i32 %a0, i32 %a1) nounwind {
 ; X86-LABEL: andnot_bitreverse_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    notl %eax
 ; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
 ; X86-NEXT:    shrl $4, %eax
 ; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
 ; X86-NEXT:    shrl $2, %eax
 ; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%edx,4), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%edx,2), %eax
-; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: andnot_bitreverse_i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    notl %eax
-; X64-NEXT:    bswapl %eax
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X64-NEXT:    shll $4, %ecx
-; X64-NEXT:    shrl $4, %eax
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    notl %esi
+; X64-NEXT:    bswapl %esi
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X64-NEXT:    orl %ecx, %eax
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X64-NEXT:    shrl $2, %eax
+; X64-NEXT:    shll $4, %eax
+; X64-NEXT:    shrl $4, %esi
+; X64-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X64-NEXT:    orl %eax, %esi
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X64-NEXT:    leal (%rax,%rcx,4), %eax
+; X64-NEXT:    shrl $2, %esi
+; X64-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X64-NEXT:    leal (%rsi,%rax,4), %eax
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
 ; X64-NEXT:    shrl %eax
@@ -541,55 +518,54 @@ define i32 @andnot_bitreverse_i32(i32 %a0) nounwind {
 ; X64-NEXT:    leal (%rax,%rcx,2), %eax
 ; X64-NEXT:    andl %edi, %eax
 ; X64-NEXT:    retq
-  %not = xor i32 %a0, -1
+  %not = xor i32 %a1, -1
   %bitrev = tail call i32 @llvm.bitreverse.i32(i32 %not)
   %and = and i32 %bitrev, %a0
   ret i32 %and
 }
 
-define i16 @andnot_bitreverse_i16(i16 %a0) nounwind {
+define i16 @andnot_bitreverse_i16(i16 %a0, i16 %a1) nounwind {
 ; X86-LABEL: andnot_bitreverse_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    notl %eax
 ; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    andl $3855, %edx # imm = 0xF0F
-; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $3855, %ecx # imm = 0xF0F
+; X86-NEXT:    shll $4, %ecx
 ; X86-NEXT:    shrl $4, %eax
 ; X86-NEXT:    andl $3855, %eax # imm = 0xF0F
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    andl $13107, %edx # imm = 0x3333
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $13107, %ecx # imm = 0x3333
 ; X86-NEXT:    shrl $2, %eax
 ; X86-NEXT:    andl $13107, %eax # imm = 0x3333
-; X86-NEXT:    leal (%eax,%edx,4), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    andl $21845, %edx # imm = 0x5555
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $21845, %ecx # imm = 0x5555
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    andl $21845, %eax # imm = 0x5555
-; X86-NEXT:    leal (%eax,%edx,2), %eax
-; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    andw {{[0-9]+}}(%esp), %ax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: andnot_bitreverse_i16:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    notl %eax
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $3855, %ecx # imm = 0xF0F
-; X64-NEXT:    shll $4, %ecx
-; X64-NEXT:    shrl $4, %eax
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    notl %esi
+; X64-NEXT:    rolw $8, %si
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    andl $3855, %eax # imm = 0xF0F
-; X64-NEXT:    orl %ecx, %eax
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $13107, %ecx # imm = 0x3333
-; X64-NEXT:    shrl $2, %eax
+; X64-NEXT:    shll $4, %eax
+; X64-NEXT:    shrl $4, %esi
+; X64-NEXT:    andl $3855, %esi # imm = 0xF0F
+; X64-NEXT:    orl %eax, %esi
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    andl $13107, %eax # imm = 0x3333
-; X64-NEXT:    leal (%rax,%rcx,4), %eax
+; X64-NEXT:    shrl $2, %esi
+; X64-NEXT:    andl $13107, %esi # imm = 0x3333
+; X64-NEXT:    leal (%rsi,%rax,4), %eax
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    andl $21845, %ecx # imm = 0x5555
 ; X64-NEXT:    shrl %eax
@@ -598,45 +574,43 @@ define i16 @andnot_bitreverse_i16(i16 %a0) nounwind {
 ; X64-NEXT:    andl %edi, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
-  %not = xor i16 %a0, -1
+  %not = xor i16 %a1, -1
   %bitrev = tail call i16 @llvm.bitreverse.i16(i16 %not)
   %and = and i16 %bitrev, %a0
   ret i16 %and
 }
 
-define i8 @andnot_bitreverse_i8(i8 %a0) nounwind {
+define i8 @andnot_bitreverse_i8(i8 %a0, i8 %a1) nounwind {
 ; X86-LABEL: andnot_bitreverse_i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    notb %al
 ; X86-NEXT:    rolb $4, %al
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    andb $51, %dl
-; X86-NEXT:    shlb $2, %dl
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andb $51, %cl
+; X86-NEXT:    shlb $2, %cl
 ; X86-NEXT:    shrb $2, %al
 ; X86-NEXT:    andb $51, %al
-; X86-NEXT:    orb %dl, %al
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    andb $85, %dl
-; X86-NEXT:    addb %dl, %dl
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andb $85, %cl
+; X86-NEXT:    addb %cl, %cl
 ; X86-NEXT:    shrb %al
 ; X86-NEXT:    andb $85, %al
-; X86-NEXT:    orb %dl, %al
-; X86-NEXT:    andb %cl, %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    andb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: andnot_bitreverse_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    notb %al
-; X64-NEXT:    rolb $4, %al
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andb $51, %cl
-; X64-NEXT:    shlb $2, %cl
-; X64-NEXT:    shrb $2, %al
+; X64-NEXT:    notb %sil
+; X64-NEXT:    rolb $4, %sil
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    andb $51, %al
-; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    shlb $2, %al
+; X64-NEXT:    shrb $2, %sil
+; X64-NEXT:    andb $51, %sil
+; X64-NEXT:    orb %sil, %al
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    andb $85, %cl
 ; X64-NEXT:    addb %cl, %cl
@@ -645,7 +619,7 @@ define i8 @andnot_bitreverse_i8(i8 %a0) nounwind {
 ; X64-NEXT:    orb %cl, %al
 ; X64-NEXT:    andb %dil, %al
 ; X64-NEXT:    retq
-  %not = xor i8 %a0, -1
+  %not = xor i8 %a1, -1
   %bitrev = tail call i8 @llvm.bitreverse.i8(i8 %not)
   %and = and i8 %bitrev, %a0
   ret i8 %and

>From cbe03646c620d08da69eac241450f32f4025c635 Mon Sep 17 00:00:00 2001
From: "A. Jiang" <de34 at live.cn>
Date: Wed, 16 Oct 2024 21:34:31 +0800
Subject: [PATCH 61/82] [libc++][ranges] LWG3692: `zip_view::iterator`'s
 `operator<=>` is overconstrained and changes of `zip_view` in P2165R4
 (#112077)

The changes are nearly pure simplifications, so I think it's OK to do
them together in the same PR.

Actual test coverage was already added in commit ad41d1e26b12
(https://reviews.llvm.org/D141216). Thanks to Casey Carter!

Fixes #104975
Towards #105200
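
As an illustrative aside (not part of the patch), with a standard library
that implements P2165R4, as libc++ does after this change, a two-range zip
dereferences to a `std::tuple` rather than a `std::pair`:

  #include <ranges>
  #include <tuple>
  #include <type_traits>
  #include <vector>

  void demo() {
    std::vector<int> a{1, 2, 3};
    std::vector<double> b{0.1, 0.2, 0.3};
    auto z = std::views::zip(a, b);
    using Ref = std::ranges::range_reference_t<decltype(z)>;
    static_assert(std::is_same_v<Ref, std::tuple<int&, double&>>);
  }

Likewise, after LWG3692 the iterator's `operator<=>` only requires
all-random-access, so comparing zip iterators no longer depends on the
underlying iterators being `three_way_comparable`.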
---
 libcxx/docs/Status/Cxx23Issues.csv            |  2 +-
 libcxx/docs/Status/Cxx23Papers.csv            |  2 +-
 libcxx/include/__ranges/zip_view.h            | 55 +++----------------
 .../range.adaptors/range.zip/cpo.pass.cpp     |  4 --
 .../range.zip/ctor.default.pass.cpp           |  6 +-
 .../range.zip/iterator/compare.pass.cpp       | 16 +-----
 .../range.zip/iterator/deref.pass.cpp         |  8 ---
 .../iterator/member_types.compile.pass.cpp    | 14 +----
 .../range.zip/iterator/subscript.pass.cpp     |  8 ---
 9 files changed, 13 insertions(+), 102 deletions(-)

diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv
index 63e4176ecba1d7..cfa721230e5fb8 100644
--- a/libcxx/docs/Status/Cxx23Issues.csv
+++ b/libcxx/docs/Status/Cxx23Issues.csv
@@ -168,7 +168,7 @@
 "`LWG3672 <https://wg21.link/LWG3672>`__","``common_iterator::operator->()`` should return by value","2022-07 (Virtual)","|Complete|","19.0",""
 "`LWG3683 <https://wg21.link/LWG3683>`__","``operator==`` for ``polymorphic_allocator`` cannot deduce template argument in common cases","2022-07 (Virtual)","|Complete|","20.0",""
 "`LWG3687 <https://wg21.link/LWG3687>`__","``expected<cv void, E>`` move constructor should move","2022-07 (Virtual)","|Complete|","16.0",""
-"`LWG3692 <https://wg21.link/LWG3692>`__","``zip_view::iterator``'s ``operator<=>`` is overconstrained","2022-07 (Virtual)","","",""
+"`LWG3692 <https://wg21.link/LWG3692>`__","``zip_view::iterator``'s ``operator<=>`` is overconstrained","2022-07 (Virtual)","|Complete|","20.0",""
 "`LWG3701 <https://wg21.link/LWG3701>`__","Make ``formatter<remove_cvref_t<const charT[N]>, charT>`` requirement explicit","2022-07 (Virtual)","|Complete|","15.0",""
 "`LWG3702 <https://wg21.link/LWG3702>`__","Should ``zip_transform_view::iterator`` remove ``operator<``","2022-07 (Virtual)","","",""
 "`LWG3703 <https://wg21.link/LWG3703>`__","Missing requirements for ``expected<T, E>`` requires ``is_void<T>``","2022-07 (Virtual)","|Complete|","16.0",""
diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv
index da7b5881877135..c64f1c4171fce1 100644
--- a/libcxx/docs/Status/Cxx23Papers.csv
+++ b/libcxx/docs/Status/Cxx23Papers.csv
@@ -60,7 +60,7 @@
 "`P1642R11 <https://wg21.link/P1642R11>`__","Freestanding ``[utilities]``, ``[ranges]``, and ``[iterators]``","2022-07 (Virtual)","","",""
 "`P1899R3 <https://wg21.link/P1899R3>`__","``stride_view``","2022-07 (Virtual)","","",""
 "`P2093R14 <https://wg21.link/P2093R14>`__","Formatted output","2022-07 (Virtual)","|Complete|","18.0",""
-"`P2165R4 <https://wg21.link/P2165R4>`__","Compatibility between ``tuple``, ``pair`` and ``tuple-like`` objects","2022-07 (Virtual)","","",""
+"`P2165R4 <https://wg21.link/P2165R4>`__","Compatibility between ``tuple``, ``pair`` and ``tuple-like`` objects","2022-07 (Virtual)","|Partial|","","Only the part for ``zip_view`` is implemented."
 "`P2278R4 <https://wg21.link/P2278R4>`__","``cbegin`` should always return a constant iterator","2022-07 (Virtual)","","",""
 "`P2286R8 <https://wg21.link/P2286R8>`__","Formatting Ranges","2022-07 (Virtual)","|Complete|","16.0",""
 "`P2291R3 <https://wg21.link/P2291R3>`__","Add Constexpr Modifiers to Functions ``to_chars`` and ``from_chars`` for Integral Types in ``<charconv>`` Header","2022-07 (Virtual)","|Complete|","16.0",""
diff --git a/libcxx/include/__ranges/zip_view.h b/libcxx/include/__ranges/zip_view.h
index fe3c87a9306fe9..835e23cb23af1b 100644
--- a/libcxx/include/__ranges/zip_view.h
+++ b/libcxx/include/__ranges/zip_view.h
@@ -36,7 +36,6 @@
 #include <__utility/forward.h>
 #include <__utility/integer_sequence.h>
 #include <__utility/move.h>
-#include <__utility/pair.h>
 #include <tuple>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -58,22 +57,11 @@ concept __zip_is_common =
     (!(bidirectional_range<_Ranges> && ...) && (common_range<_Ranges> && ...)) ||
     ((random_access_range<_Ranges> && ...) && (sized_range<_Ranges> && ...));
 
-template <typename _Tp, typename _Up>
-auto __tuple_or_pair_test() -> pair<_Tp, _Up>;
-
-template <typename... _Types>
-  requires(sizeof...(_Types) != 2)
-auto __tuple_or_pair_test() -> tuple<_Types...>;
-
-template <class... _Types>
-using __tuple_or_pair = decltype(__tuple_or_pair_test<_Types...>());
-
 template <class _Fun, class _Tuple>
 _LIBCPP_HIDE_FROM_ABI constexpr auto __tuple_transform(_Fun&& __f, _Tuple&& __tuple) {
   return std::apply(
       [&]<class... _Types>(_Types&&... __elements) {
-        return __tuple_or_pair<invoke_result_t<_Fun&, _Types>...>(
-            std::invoke(__f, std::forward<_Types>(__elements))...);
+        return tuple<invoke_result_t<_Fun&, _Types>...>(std::invoke(__f, std::forward<_Types>(__elements))...);
       },
       std::forward<_Tuple>(__tuple));
 }
@@ -88,7 +76,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __tuple_for_each(_Fun&& __f, _Tuple&& __tup
 }
 
 template <class _Fun, class _Tuple1, class _Tuple2, size_t... _Indices>
-_LIBCPP_HIDE_FROM_ABI constexpr __tuple_or_pair<
+_LIBCPP_HIDE_FROM_ABI constexpr tuple<
     invoke_result_t<_Fun&,
                     typename tuple_element<_Indices, remove_cvref_t<_Tuple1>>::type,
                     typename tuple_element<_Indices, remove_cvref_t<_Tuple2>>::type>...>
@@ -250,10 +238,9 @@ template <input_range... _Views>
   requires(view<_Views> && ...) && (sizeof...(_Views) > 0)
 template <bool _Const>
 class zip_view<_Views...>::__iterator : public __zip_view_iterator_category_base<_Const, _Views...> {
-  __tuple_or_pair<iterator_t<__maybe_const<_Const, _Views>>...> __current_;
+  tuple<iterator_t<__maybe_const<_Const, _Views>>...> __current_;
 
-  _LIBCPP_HIDE_FROM_ABI constexpr explicit __iterator(
-      __tuple_or_pair<iterator_t<__maybe_const<_Const, _Views>>...> __current)
+  _LIBCPP_HIDE_FROM_ABI constexpr explicit __iterator(tuple<iterator_t<__maybe_const<_Const, _Views>>...> __current)
       : __current_(std::move(__current)) {}
 
   template <bool>
@@ -266,7 +253,7 @@ class zip_view<_Views...>::__iterator : public __zip_view_iterator_category_base
 
 public:
   using iterator_concept = decltype(__get_zip_view_iterator_tag<_Const, _Views...>());
-  using value_type       = __tuple_or_pair<range_value_t<__maybe_const<_Const, _Views>>...>;
+  using value_type       = tuple<range_value_t<__maybe_const<_Const, _Views>>...>;
   using difference_type  = common_type_t<range_difference_t<__maybe_const<_Const, _Views>>...>;
 
   _LIBCPP_HIDE_FROM_ABI __iterator() = default;
@@ -340,33 +327,8 @@ class zip_view<_Views...>::__iterator : public __zip_view_iterator_category_base
     }
   }
 
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator<(const __iterator& __x, const __iterator& __y)
-    requires __zip_all_random_access<_Const, _Views...>
-  {
-    return __x.__current_ < __y.__current_;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator>(const __iterator& __x, const __iterator& __y)
-    requires __zip_all_random_access<_Const, _Views...>
-  {
-    return __y < __x;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator<=(const __iterator& __x, const __iterator& __y)
-    requires __zip_all_random_access<_Const, _Views...>
-  {
-    return !(__y < __x);
-  }
-
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator>=(const __iterator& __x, const __iterator& __y)
-    requires __zip_all_random_access<_Const, _Views...>
-  {
-    return !(__x < __y);
-  }
-
   _LIBCPP_HIDE_FROM_ABI friend constexpr auto operator<=>(const __iterator& __x, const __iterator& __y)
-    requires __zip_all_random_access<_Const, _Views...> &&
-             (three_way_comparable<iterator_t<__maybe_const<_Const, _Views>>> && ...)
+    requires __zip_all_random_access<_Const, _Views...>
   {
     return __x.__current_ <=> __y.__current_;
   }
@@ -427,10 +389,9 @@ template <input_range... _Views>
   requires(view<_Views> && ...) && (sizeof...(_Views) > 0)
 template <bool _Const>
 class zip_view<_Views...>::__sentinel {
-  __tuple_or_pair<sentinel_t<__maybe_const<_Const, _Views>>...> __end_;
+  tuple<sentinel_t<__maybe_const<_Const, _Views>>...> __end_;
 
-  _LIBCPP_HIDE_FROM_ABI constexpr explicit __sentinel(
-      __tuple_or_pair<sentinel_t<__maybe_const<_Const, _Views>>...> __end)
+  _LIBCPP_HIDE_FROM_ABI constexpr explicit __sentinel(tuple<sentinel_t<__maybe_const<_Const, _Views>>...> __end)
       : __end_(__end) {}
 
   friend class zip_view<_Views...>;
diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/cpo.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/cpo.pass.cpp
index ea5953cefa0ff3..bdfd58ff8bbe78 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.zip/cpo.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.zip/cpo.pass.cpp
@@ -63,11 +63,7 @@ constexpr bool test() {
         std::ranges::zip_view<std::ranges::zip_view<SizedRandomAccessView, SizedRandomAccessView>>> decltype(auto) v2 =
         std::views::zip(v);
 
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
-    static_assert(std::is_same_v<std::ranges::range_reference_t<decltype(v2)>, std::tuple<std::pair<int&, int&>>>);
-#else
     static_assert(std::is_same_v<std::ranges::range_reference_t<decltype(v2)>, std::tuple<std::tuple<int&, int&>>>);
-#endif
   }
   return true;
 }
diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/ctor.default.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/ctor.default.pass.cpp
index f53289621eabba..fdfcc02a8fb1c1 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.zip/ctor.default.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.zip/ctor.default.pass.cpp
@@ -49,12 +49,8 @@ constexpr bool test() {
     using View = std::ranges::zip_view<DefaultConstructibleView, DefaultConstructibleView>;
     View v = View(); // the default constructor is not explicit
     assert(v.size() == 3);
-    auto it = v.begin();
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
-    using Value = std::pair<const int&, const int&>;
-#else
+    auto it     = v.begin();
     using Value = std::tuple<const int&, const int&>;
-#endif
     assert(*it++ == Value(buff[0], buff[0]));
     assert(*it++ == Value(buff[1], buff[1]));
     assert(*it == Value(buff[2], buff[2]));
diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp
index ed1cb0ccebd2b5..8ab7346800093e 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp
@@ -10,17 +10,8 @@
 
 // friend constexpr bool operator==(const iterator& x, const iterator& y)
 //   requires (equality_comparable<iterator_t<maybe-const<Const, Views>>> && ...);
-// friend constexpr bool operator<(const iterator& x, const iterator& y)
-//   requires all-random-access<Const, Views...>;
-// friend constexpr bool operator>(const iterator& x, const iterator& y)
-//   requires all-random-access<Const, Views...>;
-// friend constexpr bool operator<=(const iterator& x, const iterator& y)
-//   requires all-random-access<Const, Views...>;
-// friend constexpr bool operator>=(const iterator& x, const iterator& y)
-//   requires all-random-access<Const, Views...>;
 // friend constexpr auto operator<=>(const iterator& x, const iterator& y)
-//   requires all-random-access<Const, Views...> &&
-//            (three_way_comparable<iterator_t<maybe-const<Const, Views>>> && ...);
+//   requires all-random-access<Const, Views...>;
 
 #include <ranges>
 #include <compare>
@@ -165,12 +156,7 @@ constexpr bool test() {
     using Subrange = std::ranges::subrange<It>;
     static_assert(!std::three_way_comparable<It>);
     using R = std::ranges::zip_view<Subrange, Subrange>;
-#ifdef _LIBCPP_VERSION
-    // libc++ hasn't implemented LWG-3692 "zip_view::iterator's operator<=> is overconstrained"
-    static_assert(!std::three_way_comparable<std::ranges::iterator_t<R>>);
-#else
     static_assert(std::three_way_comparable<std::ranges::iterator_t<R>>);
-#endif
 
     int a[] = {1, 2, 3, 4};
     int b[] = {5, 6, 7, 8, 9};
diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/deref.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/deref.pass.cpp
index 569d0409721941..fb58aa28fbdf8d 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/deref.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/deref.pass.cpp
@@ -42,11 +42,7 @@ constexpr bool test() {
     auto [x, y] = *it;
     assert(&x == &(a[0]));
     assert(&y == &(b[0]));
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
-    static_assert(std::is_same_v<decltype(*it), std::pair<int&, double&>>);
-#else
     static_assert(std::is_same_v<decltype(*it), std::tuple<int&, double&>>);
-#endif
 
     x = 5;
     y = 0.1;
@@ -70,11 +66,7 @@ constexpr bool test() {
     auto it = v.begin();
     assert(&(std::get<0>(*it)) == &(a[0]));
     assert(&(std::get<1>(*it)) == &(a[0]));
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
-    static_assert(std::is_same_v<decltype(*it), std::pair<int&, int const&>>);
-#else
     static_assert(std::is_same_v<decltype(*it), std::tuple<int&, int const&>>);
-#endif
   }
   return true;
 }
diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp
index c19f6c2b16524f..2f2f0fc4f4e331 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp
@@ -65,7 +65,7 @@ struct ConstVeryDifferentRange {
 void test() {
   int buffer[] = {1, 2, 3, 4};
   {
-    // 2 views should have pair value_type
+    // 2 views should have 2-tuple value_type
     // random_access_iterator_tag
     std::ranges::zip_view v(buffer, buffer);
     using Iter = decltype(v.begin());
@@ -73,11 +73,7 @@ void test() {
     static_assert(std::is_same_v<Iter::iterator_concept, std::random_access_iterator_tag>);
     static_assert(std::is_same_v<Iter::iterator_category, std::input_iterator_tag>);
     static_assert(std::is_same_v<Iter::difference_type, std::ptrdiff_t>);
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
-    static_assert(std::is_same_v<Iter::value_type, std::pair<int, int>>);
-#else
     static_assert(std::is_same_v<Iter::value_type, std::tuple<int, int>>);
-#endif
     static_assert(HasIterCategory<Iter>);
   }
 
@@ -124,11 +120,7 @@ void test() {
     static_assert(std::is_same_v<Iter::iterator_concept, std::random_access_iterator_tag>);
     static_assert(std::is_same_v<Iter::iterator_category, std::input_iterator_tag>);
     static_assert(std::is_same_v<Iter::difference_type, std::ptrdiff_t>);
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
-    static_assert(std::is_same_v<Iter::value_type, std::pair<int, std::pair<int, int>>>);
-#else
     static_assert(std::is_same_v<Iter::value_type, std::tuple<int, std::tuple<int, int>>>);
-#endif
     static_assert(HasIterCategory<Iter>);
   }
 
@@ -169,11 +161,7 @@ void test() {
     // value_type of multiple views with different value_type
     std::ranges::zip_view v{foos, bars};
     using Iter = decltype(v.begin());
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
-    static_assert(std::is_same_v<Iter::value_type, std::pair<Foo, Bar>>);
-#else
     static_assert(std::is_same_v<Iter::value_type, std::tuple<Foo, Bar>>);
-#endif
   }
 
   {
diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/subscript.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/subscript.pass.cpp
index 1538d763205db1..ba3abfa2a43695 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/subscript.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/subscript.pass.cpp
@@ -27,11 +27,7 @@ constexpr bool test() {
     assert(it[2] == *(it + 2));
     assert(it[4] == *(it + 4));
 
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
-    static_assert(std::is_same_v<decltype(it[2]), std::pair<int&, int>>);
-#else
     static_assert(std::is_same_v<decltype(it[2]), std::tuple<int&, int>>);
-#endif
   }
 
   {
@@ -42,11 +38,7 @@ constexpr bool test() {
     assert(it[2] == *(it + 2));
     assert(it[4] == *(it + 4));
 
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
-    static_assert(std::is_same_v<decltype(it[2]), std::pair<int&, int&>>);
-#else
     static_assert(std::is_same_v<decltype(it[2]), std::tuple<int&, int&>>);
-#endif
   }
 
   {

>From 658ff0b84c9dd5f33f4d769ba7378cc2c64315a1 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Wed, 16 Oct 2024 09:37:27 -0400
Subject: [PATCH 62/82] [RISCV][VLOPT] Add support for integer widening
 multiply instructions (#112204)

This adds support for these instructions and also tests getOperandInfo
for them.
---
 llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp   |   7 +-
 llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll | 126 +++++++++++++++++++
 2 files changed, 132 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 53373b7a0f1575..6053899987db9b 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -563,7 +563,12 @@ static bool isSupportedInstr(const MachineInstr &MI) {
   case RISCV::VREM_VV:
   case RISCV::VREM_VX:
   // Vector Widening Integer Multiply Instructions
-  // FIXME: Add support
+  case RISCV::VWMUL_VV:
+  case RISCV::VWMUL_VX:
+  case RISCV::VWMULSU_VV:
+  case RISCV::VWMULSU_VX:
+  case RISCV::VWMULU_VV:
+  case RISCV::VWMULU_VX:
   // Vector Single-Width Integer Multiply-Add Instructions
   // FIXME: Add support
   // Vector Widening Integer Multiply-Add Instructions
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
index a360ae1998f77a..11f603b56b6e56 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
@@ -1122,6 +1122,132 @@ define <vscale x 4 x i32> @vrem_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
   ret <vscale x 4 x i32> %2
 }
 
+define <vscale x 4 x i64> @vwmul_vv(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, iXLen %vl) {
+; NOVLOPT-LABEL: vwmul_vv:
+; NOVLOPT:       # %bb.0:
+; NOVLOPT-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; NOVLOPT-NEXT:    vwmul.vv v12, v8, v9
+; NOVLOPT-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; NOVLOPT-NEXT:    vwmul.vv v8, v12, v12
+; NOVLOPT-NEXT:    ret
+;
+; VLOPT-LABEL: vwmul_vv:
+; VLOPT:       # %bb.0:
+; VLOPT-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; VLOPT-NEXT:    vwmul.vv v12, v8, v9
+; VLOPT-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT:    vwmul.vv v8, v12, v12
+; VLOPT-NEXT:    ret
+  %1 = call <vscale x 4 x i32> @llvm.riscv.vwmul.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b, iXLen -1)
+  %2 = call <vscale x 4 x i64> @llvm.riscv.vwmul.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
+  ret <vscale x 4 x i64> %2
+}
+
+define <vscale x 4 x i64> @vwmul_vx(<vscale x 4 x i16> %a, i16 %b, i32 %c, iXLen %vl) {
+; NOVLOPT-LABEL: vwmul_vx:
+; NOVLOPT:       # %bb.0:
+; NOVLOPT-NEXT:    vsetvli a3, zero, e16, m1, ta, ma
+; NOVLOPT-NEXT:    vwmul.vx v12, v8, a0
+; NOVLOPT-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
+; NOVLOPT-NEXT:    vwmul.vx v8, v12, a1
+; NOVLOPT-NEXT:    ret
+;
+; VLOPT-LABEL: vwmul_vx:
+; VLOPT:       # %bb.0:
+; VLOPT-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; VLOPT-NEXT:    vwmul.vx v12, v8, a0
+; VLOPT-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT:    vwmul.vx v8, v12, a1
+; VLOPT-NEXT:    ret
+  %1 = call <vscale x 4 x i32> @llvm.riscv.vwmul.nxv4i32.nxv4i16.i16(<vscale x 4 x i32> poison, <vscale x 4 x i16> %a, i16 %b, iXLen -1)
+  %2 = call <vscale x 4 x i64> @llvm.riscv.vwmul.nxv4i64.nxv4i64.i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %1, i32 %c, iXLen %vl)
+  ret <vscale x 4 x i64> %2
+}
+
+define <vscale x 4 x i64> @vwmulsu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
+; NOVLOPT-LABEL: vwmulsu_vv:
+; NOVLOPT:       # %bb.0:
+; NOVLOPT-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; NOVLOPT-NEXT:    vwmulsu.vv v12, v8, v10
+; NOVLOPT-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; NOVLOPT-NEXT:    vadd.vv v8, v12, v12
+; NOVLOPT-NEXT:    ret
+;
+; VLOPT-LABEL: vwmulsu_vv:
+; VLOPT:       # %bb.0:
+; VLOPT-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; VLOPT-NEXT:    vwmulsu.vv v12, v8, v10
+; VLOPT-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; VLOPT-NEXT:    vadd.vv v8, v12, v12
+; VLOPT-NEXT:    ret
+  %1 = call <vscale x 4 x i64> @llvm.riscv.vwmulsu.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
+  %2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
+  ret <vscale x 4 x i64> %2
+}
+
+define <vscale x 4 x i64> @vwmulsu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
+; NOVLOPT-LABEL: vwmulsu_vx:
+; NOVLOPT:       # %bb.0:
+; NOVLOPT-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; NOVLOPT-NEXT:    vwmulsu.vx v12, v8, a0
+; NOVLOPT-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; NOVLOPT-NEXT:    vadd.vv v8, v12, v12
+; NOVLOPT-NEXT:    ret
+;
+; VLOPT-LABEL: vwmulsu_vx:
+; VLOPT:       # %bb.0:
+; VLOPT-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; VLOPT-NEXT:    vwmulsu.vx v12, v8, a0
+; VLOPT-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; VLOPT-NEXT:    vadd.vv v8, v12, v12
+; VLOPT-NEXT:    ret
+  %1 = call <vscale x 4 x i64> @llvm.riscv.vwmulsu.nxv4i64.nxv4i32.i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
+  %2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
+  ret <vscale x 4 x i64> %2
+}
+
+define <vscale x 4 x i64> @vwmulu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
+; NOVLOPT-LABEL: vwmulu_vv:
+; NOVLOPT:       # %bb.0:
+; NOVLOPT-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; NOVLOPT-NEXT:    vwmulu.vv v12, v8, v10
+; NOVLOPT-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; NOVLOPT-NEXT:    vadd.vv v8, v12, v12
+; NOVLOPT-NEXT:    ret
+;
+; VLOPT-LABEL: vwmulu_vv:
+; VLOPT:       # %bb.0:
+; VLOPT-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; VLOPT-NEXT:    vwmulu.vv v12, v8, v10
+; VLOPT-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; VLOPT-NEXT:    vadd.vv v8, v12, v12
+; VLOPT-NEXT:    ret
+  %1 = call <vscale x 4 x i64> @llvm.riscv.vwmulu.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
+  %2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
+  ret <vscale x 4 x i64> %2
+}
+
+define <vscale x 4 x i64> @vwmulu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
+; NOVLOPT-LABEL: vwmulu_vx:
+; NOVLOPT:       # %bb.0:
+; NOVLOPT-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; NOVLOPT-NEXT:    vwmulu.vx v12, v8, a0
+; NOVLOPT-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; NOVLOPT-NEXT:    vadd.vv v8, v12, v12
+; NOVLOPT-NEXT:    ret
+;
+; VLOPT-LABEL: vwmulu_vx:
+; VLOPT:       # %bb.0:
+; VLOPT-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; VLOPT-NEXT:    vwmulu.vx v12, v8, a0
+; VLOPT-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; VLOPT-NEXT:    vadd.vv v8, v12, v12
+; VLOPT-NEXT:    ret
+  %1 = call <vscale x 4 x i64> @llvm.riscv.vwmulu.nxv4i64.nxv4i32.i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
+  %2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
+  ret <vscale x 4 x i64> %2
+}
+
 define <vscale x 4 x i32> @vwmacc_vx(<vscale x 4 x i16> %a, i16 %b, iXLen %vl) {
 ; NOVLOPT-LABEL: vwmacc_vx:
 ; NOVLOPT:       # %bb.0:

>From 3a56b03ef33a7462f8e21ed295e59b7d851f85fa Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Wed, 16 Oct 2024 06:40:10 -0700
Subject: [PATCH 63/82] [IR] Avoid repeated hash lookups (NFC) (#112469)

---
 llvm/lib/IR/LegacyPassManager.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp
index 96e2f1d7908ba6..ce6f6c733f4bcc 100644
--- a/llvm/lib/IR/LegacyPassManager.cpp
+++ b/llvm/lib/IR/LegacyPassManager.cpp
@@ -104,15 +104,13 @@ void PMDataManager::emitInstrCountChangedRemark(
       [&FunctionToInstrCount](Function &MaybeChangedFn) {
         // Update the total module count.
         unsigned FnSize = MaybeChangedFn.getInstructionCount();
-        auto It = FunctionToInstrCount.find(MaybeChangedFn.getName());
 
         // If we created a new function, then we need to add it to the map and
         // say that it changed from 0 instructions to FnSize.
-        if (It == FunctionToInstrCount.end()) {
-          FunctionToInstrCount[MaybeChangedFn.getName()] =
-              std::pair<unsigned, unsigned>(0, FnSize);
+        auto [It, Inserted] = FunctionToInstrCount.try_emplace(
+            MaybeChangedFn.getName(), 0, FnSize);
+        if (Inserted)
           return;
-        }
         // Insert the new function size into the second member of the pair. This
         // tells us whether or not this function changed in size.
         It->second.second = FnSize;

>From 0a20ab908ca7cc82a4c206d39d0eaf86a46e1ff0 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Wed, 16 Oct 2024 06:40:48 -0700
Subject: [PATCH 64/82] [mlir] Avoid repeated hash lookups (NFC) (#112472)

---
 mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp b/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp
index 40c83487fd47d5..27e89d69e214d6 100644
--- a/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp
+++ b/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp
@@ -148,8 +148,9 @@ void MLProgramPipelineGlobals::processBlock(
     if (auto store = mlir::dyn_cast<GlobalStoreOp>(op)) {
       auto ref = store.getGlobal();
       symbolStore.insert(ref);
-      if (previousStores.contains(ref)) {
-        toDelete.push_back(previousStores.find(ref)->getSecond());
+      auto it = previousStores.find(ref);
+      if (it != previousStores.end()) {
+        toDelete.push_back(it->getSecond());
       }
 
       previousLoads[ref] = store.getValue();

>From 9128077c88f0112b4a5b1f64922247793250001b Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Wed, 16 Oct 2024 06:41:19 -0700
Subject: [PATCH 65/82] [Scalar] Avoid repeated hash lookups (NFC) (#112486)

---
 llvm/lib/Transforms/Scalar/Float2Int.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/Float2Int.cpp b/llvm/lib/Transforms/Scalar/Float2Int.cpp
index 98ecbe400543e1..9d23c899430095 100644
--- a/llvm/lib/Transforms/Scalar/Float2Int.cpp
+++ b/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -398,9 +398,9 @@ bool Float2IntPass::validateAndTransform(const DataLayout &DL) {
 }
 
 Value *Float2IntPass::convert(Instruction *I, Type *ToTy) {
-  if (ConvertedInsts.contains(I))
+  if (auto It = ConvertedInsts.find(I); It != ConvertedInsts.end())
     // Already converted this instruction.
-    return ConvertedInsts[I];
+    return It->second;
 
   SmallVector<Value*,4> NewOperands;
   for (Value *V : I->operands()) {

>From f5d3c87ede965d3cb4dd58aeed0a0b94e674b997 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Wed, 16 Oct 2024 06:41:40 -0700
Subject: [PATCH 66/82] [IPO] Simplify code with StringMap::operator[] (NFC)
 (#112490)

---
 llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h
index 076d91adfd1dea..4e757b29961817 100644
--- a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h
+++ b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h
@@ -201,9 +201,7 @@ class SampleProfileMatcher {
   void UpdateWithSalvagedProfiles();
 
   LocToLocMap &getIRToProfileLocationMap(const Function &F) {
-    auto Ret = FuncMappings.try_emplace(
-        FunctionSamples::getCanonicalFnName(F.getName()), LocToLocMap());
-    return Ret.first->second;
+    return FuncMappings[FunctionSamples::getCanonicalFnName(F.getName())];
   }
   void distributeIRToProfileLocationMap();
   void distributeIRToProfileLocationMap(FunctionSamples &FS);

>From a3010c77910c706be4c51ce4a95d51211e335a1f Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 16 Oct 2024 14:43:28 +0100
Subject: [PATCH 67/82] [GlobalISel] Add boolean predicated legalization action
 methods. (#111287)

On AArch64 it is common, and will become more common, to have operation
legalization rules that depend on a feature of the architecture, for
example HasFP16 or the newer CSSC integer min/max instructions, among
many others. With the current legalization rules this either means
adding a custom predicate based on the feature, as in
`legalIf([=](const LegalityQuery &Query) { return HasFP16 && ...; })`, or
splitting the legalization rules into pieces so that rules can be added
to them optionally, based on the features available.

This patch proposes an alternative where the existing routines like
legalFor(..) are provided a boolean predicate, which if false skips
adding the rule. It makes the rules cleaner and will hopefully allow
them to scale better as we add more features.

I have changed the SVE predicates for loads/stores to just always be
available. Scalable vectors without SVE have never been supported, but a
condition could also be added.
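
As a rough, self-contained illustration of the pattern (a toy sketch only;
ToyRuleSet and the integer "bit widths" are invented for this example and
are not the real LegalizeRuleSet API), the predicated overload simply
returns the rule set unchanged when the feature predicate is false:

  // Toy model of a builder whose rule-adding method takes an optional
  // boolean predicate; a false predicate skips adding the rule.
  #include <initializer_list>
  #include <iostream>
  #include <vector>

  struct ToyRuleSet {
    std::vector<int> LegalBitWidths;

    ToyRuleSet &legalFor(std::initializer_list<int> Widths) {
      LegalBitWidths.insert(LegalBitWidths.end(), Widths.begin(), Widths.end());
      return *this;
    }

    // Predicated overload: do nothing when Pred is false.
    ToyRuleSet &legalFor(bool Pred, std::initializer_list<int> Widths) {
      if (!Pred)
        return *this;
      return legalFor(Widths);
    }
  };

  int main() {
    bool HasCSSC = false; // pretend the subtarget lacks the feature
    ToyRuleSet Rules;
    Rules.legalFor({8, 16})            // unconditional rules
        .legalFor(HasCSSC, {32, 64});  // only added when the feature exists
    for (int W : Rules.LegalBitWidths)
      std::cout << "legal width: " << W << "\n"; // prints 8 and 16 only
  }

This keeps feature-dependent rules inside a single builder chain instead of
splitting them across if/else blocks.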
---
 .../llvm/CodeGen/GlobalISel/LegalizerInfo.h   |  31 ++-
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    | 181 ++++++------------
 .../GlobalISel/legalizer-info-validation.mir  |  72 +++----
 3 files changed, 119 insertions(+), 165 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index 82e713f30ea31c..4e5a6cf92b761d 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -599,11 +599,22 @@ class LegalizeRuleSet {
   LegalizeRuleSet &legalFor(std::initializer_list<LLT> Types) {
     return actionFor(LegalizeAction::Legal, Types);
   }
+  LegalizeRuleSet &legalFor(bool Pred, std::initializer_list<LLT> Types) {
+    if (!Pred)
+      return *this;
+    return actionFor(LegalizeAction::Legal, Types);
+  }
   /// The instruction is legal when type indexes 0 and 1 is any type pair in the
   /// given list.
   LegalizeRuleSet &legalFor(std::initializer_list<std::pair<LLT, LLT>> Types) {
     return actionFor(LegalizeAction::Legal, Types);
   }
+  LegalizeRuleSet &legalFor(bool Pred,
+                            std::initializer_list<std::pair<LLT, LLT>> Types) {
+    if (!Pred)
+      return *this;
+    return actionFor(LegalizeAction::Legal, Types);
+  }
   /// The instruction is legal when type index 0 is any type in the given list
   /// and imm index 0 is anything.
   LegalizeRuleSet &legalForTypeWithAnyImm(std::initializer_list<LLT> Types) {
@@ -846,12 +857,23 @@ class LegalizeRuleSet {
   LegalizeRuleSet &customFor(std::initializer_list<LLT> Types) {
     return actionFor(LegalizeAction::Custom, Types);
   }
+  LegalizeRuleSet &customFor(bool Pred, std::initializer_list<LLT> Types) {
+    if (!Pred)
+      return *this;
+    return actionFor(LegalizeAction::Custom, Types);
+  }
 
-  /// The instruction is custom when type indexes 0 and 1 is any type pair in the
-  /// given list.
+  /// The instruction is custom when type indexes 0 and 1 is any type pair in
+  /// the given list.
   LegalizeRuleSet &customFor(std::initializer_list<std::pair<LLT, LLT>> Types) {
     return actionFor(LegalizeAction::Custom, Types);
   }
+  LegalizeRuleSet &customFor(bool Pred,
+                             std::initializer_list<std::pair<LLT, LLT>> Types) {
+    if (!Pred)
+      return *this;
+    return actionFor(LegalizeAction::Custom, Types);
+  }
 
   LegalizeRuleSet &customForCartesianProduct(std::initializer_list<LLT> Types) {
     return actionForCartesianProduct(LegalizeAction::Custom, Types);
@@ -990,6 +1012,11 @@ class LegalizeRuleSet {
                     scalarNarrowerThan(TypeIdx, Ty.getSizeInBits()),
                     changeTo(typeIdx(TypeIdx), Ty));
   }
+  LegalizeRuleSet &minScalar(bool Pred, unsigned TypeIdx, const LLT Ty) {
+    if (!Pred)
+      return *this;
+    return minScalar(TypeIdx, Ty);
+  }
 
   /// Ensure the scalar is at least as wide as Ty if condition is met.
   LegalizeRuleSet &minScalarIf(LegalityPredicate Predicate, unsigned TypeIdx,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index a69894839361bc..773f5c0923e95c 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -215,19 +215,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .legalFor({s64, v8s16, v16s8, v4s32})
       .lower();
 
-  auto &MinMaxActions = getActionDefinitionsBuilder(
-      {G_SMIN, G_SMAX, G_UMIN, G_UMAX});
-  if (HasCSSC)
-    MinMaxActions
-        .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
-        // Making clamping conditional on CSSC extension as without legal types we
-        // lower to CMP which can fold one of the two sxtb's we'd otherwise need
-        // if we detect a type smaller than 32-bit.
-        .minScalar(0, s32);
-  else
-    MinMaxActions
-        .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32});
-  MinMaxActions
+  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+      .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
+      .legalFor(HasCSSC, {s32, s64})
+      .minScalar(HasCSSC, 0, s32)
       .clampNumElements(0, v8s8, v16s8)
       .clampNumElements(0, v4s16, v8s16)
       .clampNumElements(0, v2s32, v4s32)
@@ -247,11 +238,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       {G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FSQRT, G_FMAXNUM, G_FMINNUM,
        G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR, G_FRINT, G_FNEARBYINT,
        G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
-      .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
-      .legalIf([=](const LegalityQuery &Query) {
-        const auto &Ty = Query.Types[0];
-        return (Ty == v8s16 || Ty == v4s16) && HasFP16;
-      })
+      .legalFor({s32, s64, v2s32, v4s32, v2s64})
+      .legalFor(HasFP16, {s16, v4s16, v8s16})
       .libcallFor({s128})
       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
       .minScalarOrElt(0, MinFPScalar)
@@ -261,11 +249,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .moreElementsToNextPow2(0);
 
   getActionDefinitionsBuilder({G_FABS, G_FNEG})
-      .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
-      .legalIf([=](const LegalityQuery &Query) {
-        const auto &Ty = Query.Types[0];
-        return (Ty == v8s16 || Ty == v4s16) && HasFP16;
-      })
+      .legalFor({s32, s64, v2s32, v4s32, v2s64})
+      .legalFor(HasFP16, {s16, v4s16, v8s16})
       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
       .lowerIf(scalarOrEltWiderThan(0, 64))
       .clampNumElements(0, v4s16, v8s16)
@@ -350,31 +335,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
     return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0;
   };
 
-  auto &LoadActions = getActionDefinitionsBuilder(G_LOAD);
-  auto &StoreActions = getActionDefinitionsBuilder(G_STORE);
-
-  if (ST.hasSVE()) {
-    LoadActions.legalForTypesWithMemDesc({
-        // 128 bit base sizes
-        {nxv16s8, p0, nxv16s8, 8},
-        {nxv8s16, p0, nxv8s16, 8},
-        {nxv4s32, p0, nxv4s32, 8},
-        {nxv2s64, p0, nxv2s64, 8},
-    });
-
-    // TODO: Add nxv2p0. Consider bitcastIf.
-    //       See #92130
-    //       https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
-    StoreActions.legalForTypesWithMemDesc({
-        // 128 bit base sizes
-        {nxv16s8, p0, nxv16s8, 8},
-        {nxv8s16, p0, nxv8s16, 8},
-        {nxv4s32, p0, nxv4s32, 8},
-        {nxv2s64, p0, nxv2s64, 8},
-    });
-  }
-
-  LoadActions
+  getActionDefinitionsBuilder(G_LOAD)
       .customIf([=](const LegalityQuery &Query) {
         return HasRCPC3 && Query.Types[0] == s128 &&
                Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire;
@@ -399,6 +360,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       // These extends are also legal
       .legalForTypesWithMemDesc(
           {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
+      .legalForTypesWithMemDesc({
+          // SVE vscale x 128 bit base sizes
+          {nxv16s8, p0, nxv16s8, 8},
+          {nxv8s16, p0, nxv8s16, 8},
+          {nxv4s32, p0, nxv4s32, 8},
+          {nxv2s64, p0, nxv2s64, 8},
+      })
       .widenScalarToNextPow2(0, /* MinSize = */ 8)
       .clampMaxNumElements(0, s8, 16)
       .clampMaxNumElements(0, s16, 8)
@@ -425,7 +393,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
 
-  StoreActions
+  getActionDefinitionsBuilder(G_STORE)
       .customIf([=](const LegalityQuery &Query) {
         return HasRCPC3 && Query.Types[0] == s128 &&
                Query.MMODescrs[0].Ordering == AtomicOrdering::Release;
@@ -445,6 +413,16 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
            {p0, p0, s64, 8},    {s128, p0, s128, 8},  {v16s8, p0, s128, 8},
            {v8s8, p0, s64, 8},  {v4s16, p0, s64, 8},  {v8s16, p0, s128, 8},
            {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
+      .legalForTypesWithMemDesc({
+          // SVE vscale x 128 bit base sizes
+          // TODO: Add nxv2p0. Consider bitcastIf.
+          //       See #92130
+          // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
+          {nxv16s8, p0, nxv16s8, 8},
+          {nxv8s16, p0, nxv8s16, 8},
+          {nxv4s32, p0, nxv4s32, 8},
+          {nxv2s64, p0, nxv2s64, 8},
+      })
       .clampScalar(0, s8, s64)
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].isScalar() &&
@@ -532,12 +510,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .widenScalarToNextPow2(0)
       .clampScalar(0, s8, s64);
   getActionDefinitionsBuilder(G_FCONSTANT)
-      .legalIf([=](const LegalityQuery &Query) {
-        const auto &Ty = Query.Types[0];
-        if (HasFP16 && Ty == s16)
-          return true;
-        return Ty == s32 || Ty == s64 || Ty == s128;
-      })
+      .legalFor({s32, s64, s128})
+      .legalFor(HasFP16, {s16})
       .clampScalar(0, MinFPScalar, s128);
 
   // FIXME: fix moreElementsToNextPow2
@@ -569,16 +543,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .customIf(isVector(0));
 
   getActionDefinitionsBuilder(G_FCMP)
-      .legalFor({{s32, MinFPScalar},
-                 {s32, s32},
+      .legalFor({{s32, s32},
                  {s32, s64},
                  {v4s32, v4s32},
                  {v2s32, v2s32},
                  {v2s64, v2s64}})
-      .legalIf([=](const LegalityQuery &Query) {
-        const auto &Ty = Query.Types[1];
-        return (Ty == v8s16 || Ty == v4s16) && Ty == Query.Types[0] && HasFP16;
-      })
+      .legalFor(HasFP16, {{s32, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
       .widenScalarOrEltToNextPow2(1)
       .clampScalar(0, s32, s32)
       .minScalarOrElt(1, MinFPScalar)
@@ -693,13 +663,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                  {v2s64, v2s64},
                  {v4s32, v4s32},
                  {v2s32, v2s32}})
-      .legalIf([=](const LegalityQuery &Query) {
-        return HasFP16 &&
-               (Query.Types[1] == s16 || Query.Types[1] == v4s16 ||
-                Query.Types[1] == v8s16) &&
-               (Query.Types[0] == s32 || Query.Types[0] == s64 ||
-                Query.Types[0] == v4s16 || Query.Types[0] == v8s16);
-      })
+      .legalFor(HasFP16,
+                {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
       .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
       // The range of a fp16 value fits into an i17, so we can lower the width
@@ -741,13 +706,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                  {v2s64, v2s64},
                  {v4s32, v4s32},
                  {v2s32, v2s32}})
-      .legalIf([=](const LegalityQuery &Query) {
-        return HasFP16 &&
-               (Query.Types[1] == s16 || Query.Types[1] == v4s16 ||
-                Query.Types[1] == v8s16) &&
-               (Query.Types[0] == s32 || Query.Types[0] == s64 ||
-                Query.Types[0] == v4s16 || Query.Types[0] == v8s16);
-      })
+      .legalFor(HasFP16,
+                {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
       // Handle types larger than i64 by scalarizing/lowering.
       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
       .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
@@ -790,13 +750,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                  {v2s64, v2s64},
                  {v4s32, v4s32},
                  {v2s32, v2s32}})
-      .legalIf([=](const LegalityQuery &Query) {
-        return HasFP16 &&
-               (Query.Types[0] == s16 || Query.Types[0] == v4s16 ||
-                Query.Types[0] == v8s16) &&
-               (Query.Types[1] == s32 || Query.Types[1] == s64 ||
-                Query.Types[1] == v4s16 || Query.Types[1] == v8s16);
-      })
+      .legalFor(HasFP16,
+                {{s16, s32}, {s16, s64}, {v4s16, v4s16}, {v8s16, v8s16}})
       .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
       .moreElementsToNextPow2(1)
@@ -1050,12 +1005,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .widenScalarToNextPow2(1, /*Min=*/32)
       .clampScalar(1, s32, s64)
       .scalarSameSizeAs(0, 1)
-      .legalIf([=](const LegalityQuery &Query) {
-        return (HasCSSC && typeInSet(0, {s32, s64})(Query));
-      })
-      .customIf([=](const LegalityQuery &Query) {
-        return (!HasCSSC && typeInSet(0, {s32, s64})(Query));
-      });
+      .legalFor(HasCSSC, {s32, s64})
+      .customFor(!HasCSSC, {s32, s64});
 
   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
       .legalIf([=](const LegalityQuery &Query) {
@@ -1143,11 +1094,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
   }
 
   // FIXME: Legal vector types are only legal with NEON.
-  auto &ABSActions = getActionDefinitionsBuilder(G_ABS);
-  if (HasCSSC)
-    ABSActions
-        .legalFor({s32, s64});
-  ABSActions.legalFor(PackedVectorAllTypeList)
+  getActionDefinitionsBuilder(G_ABS)
+      .legalFor(HasCSSC, {s32, s64})
+      .legalFor(PackedVectorAllTypeList)
       .customIf([=](const LegalityQuery &Q) {
         // TODO: Fix suboptimal codegen for 128+ bit types.
         LLT SrcTy = Q.Types[0];
@@ -1171,10 +1120,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
   // later.
   getActionDefinitionsBuilder(G_VECREDUCE_FADD)
       .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
-      .legalIf([=](const LegalityQuery &Query) {
-        const auto &Ty = Query.Types[1];
-        return (Ty == v4s16 || Ty == v8s16) && HasFP16;
-      })
+      .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}})
       .minScalarOrElt(0, MinFPScalar)
       .clampMaxNumElements(1, s64, 2)
       .clampMaxNumElements(1, s32, 4)
@@ -1215,10 +1161,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
   getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX,
                                G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM})
       .legalFor({{s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
-      .legalIf([=](const LegalityQuery &Query) {
-        const auto &Ty = Query.Types[1];
-        return Query.Types[0] == s16 && (Ty == v8s16 || Ty == v4s16) && HasFP16;
-      })
+      .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}})
       .minScalarOrElt(0, MinFPScalar)
       .clampMaxNumElements(1, s64, 2)
       .clampMaxNumElements(1, s32, 4)
@@ -1295,32 +1238,16 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .customFor({{s32, s32}, {s64, s64}});
 
   auto always = [=](const LegalityQuery &Q) { return true; };
-  auto &CTPOPActions = getActionDefinitionsBuilder(G_CTPOP);
-  if (HasCSSC)
-    CTPOPActions
-        .legalFor({{s32, s32},
-                   {s64, s64},
-                   {v8s8, v8s8},
-                   {v16s8, v16s8}})
-        .customFor({{s128, s128},
-                    {v2s64, v2s64},
-                    {v2s32, v2s32},
-                    {v4s32, v4s32},
-                    {v4s16, v4s16},
-                    {v8s16, v8s16}});
-  else
-    CTPOPActions
-        .legalFor({{v8s8, v8s8},
-                   {v16s8, v16s8}})
-        .customFor({{s32, s32},
-                    {s64, s64},
-                    {s128, s128},
-                    {v2s64, v2s64},
-                    {v2s32, v2s32},
-                    {v4s32, v4s32},
-                    {v4s16, v4s16},
-                    {v8s16, v8s16}});
-  CTPOPActions
+  getActionDefinitionsBuilder(G_CTPOP)
+      .legalFor(HasCSSC, {{s32, s32}, {s64, s64}})
+      .legalFor({{v8s8, v8s8}, {v16s8, v16s8}})
+      .customFor(!HasCSSC, {{s32, s32}, {s64, s64}})
+      .customFor({{s128, s128},
+                  {v2s64, v2s64},
+                  {v2s32, v2s32},
+                  {v4s32, v4s32},
+                  {v4s16, v4s16},
+                  {v8s16, v8s16}})
       .clampScalar(0, s32, s128)
       .widenScalarToNextPow2(0)
       .minScalarEltSameAsIf(always, 1, 0)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index a21b786a2bae97..073c3cafa062d3 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -152,12 +152,12 @@
 #
 # DEBUG-NEXT: G_INTRINSIC_TRUNC (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_INTRINSIC_ROUND (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_INTRINSIC_LRINT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. the first uncovered type index: 2, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
@@ -167,8 +167,8 @@
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_INTRINSIC_ROUNDEVEN (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_READCYCLECOUNTER (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
@@ -310,8 +310,8 @@
 # DEBUG-NEXT: .. the first uncovered type index: 1, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FCONSTANT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_VASTART (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. the first uncovered type index: 1, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
@@ -459,27 +459,27 @@
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: G_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FSUB (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FMUL (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FMA (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FMAD (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_FDIV (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FREM (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. the first uncovered type index: 1, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
@@ -565,12 +565,12 @@
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: G_FMINNUM (opcode {{[0-9]+}}): 1 type index
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FMAXNUM (opcode {{[0-9]+}}): 1 type index
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FMINNUM_IEEE (opcode {{[0-9]+}}): 1 type index
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
@@ -579,12 +579,12 @@
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: G_FMINIMUM (opcode {{[0-9]+}}): 1 type index
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FMAXIMUM (opcode {{[0-9]+}}): 1 type index
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_GET_FPENV (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
@@ -692,8 +692,8 @@
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FCEIL (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FCOS (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. the first uncovered type index: 1, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
@@ -734,20 +734,20 @@
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FSQRT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FFLOOR (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FRINT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FNEARBYINT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_ADDRSPACE_CAST (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined

>From 621fcf892bcd3c2d81e25c6ee39ca32db3c6b05a Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Wed, 16 Oct 2024 14:43:57 +0100
Subject: [PATCH 68/82] [mlir][OpenMP] rewrite conversion of privatisation for
 omp.parallel (#111844)

The existing conversion inlined private alloc regions and firstprivate
copy regions in MLIR, then undid those modifications to the MLIR module
before completing the conversion. To make this work, LLVM IR had to be
generated using the wrong mapping for privatised values and then fixed
up later inside of OpenMPIRBuilder.

This approach violated an assumption in OpenMPIRBuilder that private
variables would be values, not constants. Flang sometimes generates code
where private variables are promoted to globals, the address of which is
treated as a constant in LLVM IR. This prevented the incorrect values for
the private variable from being replaced by OpenMPIRBuilder, ultimately
resulting in programs producing incorrect results.

This patch rewrites delayed privatisation for omp.parallel to work more
similarly to reductions: translating directly into LLVM IR with correct
mappings for private variables.

RFC:
https://discourse.llvm.org/t/rfc-openmp-fix-issue-in-mlir-to-llvmir-translation-for-delayed-privatisation/81225

Tested against the gfortran testsuite and our internal test suite.
Linaro's post-commit bots will check against the fujitsu test suite.

I decided to add the new tests as flang integration tests rather than in
mlir/test/Target/LLVMIR:
- The regression test is for an issue filed against flang. I wanted to
keep the reproducer similar to the code in the ticket.
- I found the "worst case" CFG test difficult to reason about in the
abstract; it helped me to think about what was going on in terms of a
Fortran program.

Fixes #106297
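
As a standalone illustration of the constants-vs-values point above (this
is not part of the patch; it assumes an LLVM tree to compile against, and
the module and variable names are invented), the address of a promoted
global is itself an llvm::Constant rather than an ordinary SSA value:

  // Show that a module-level global's address is an llvm::Constant --
  // the distinction the old fix-up-afterwards approach tripped over.
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/GlobalVariable.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Module.h"
  #include "llvm/IR/Type.h"
  #include "llvm/Support/Casting.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    Module M("demo", Ctx);
    Type *I32 = Type::getInt32Ty(Ctx);
    // A private variable promoted to a global, as flang sometimes does.
    auto *G = new GlobalVariable(M, I32, /*isConstant=*/false,
                                 GlobalValue::InternalLinkage,
                                 ConstantInt::get(I32, 0), "promoted_priv");
    outs() << "global address is a Constant: " << isa<Constant>(G) << "\n"; // 1
    return 0;
  }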
---
 .../parallel-private-reduction-worstcase.f90  | 262 +++++++++++
 .../Integration/OpenMP/private-global.f90     |  46 ++
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 414 +++++++-----------
 .../Target/LLVMIR/openmp-firstprivate.mlir    |  25 +-
 mlir/test/Target/LLVMIR/openmp-private.mlir   |   6 +-
 5 files changed, 490 insertions(+), 263 deletions(-)
 create mode 100644 flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90
 create mode 100644 flang/test/Integration/OpenMP/private-global.f90

diff --git a/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90
new file mode 100644
index 00000000000000..3aa5d042463973
--- /dev/null
+++ b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90
@@ -0,0 +1,262 @@
+! RUN: %flang_fc1 -fopenmp -emit-llvm %s -o - | FileCheck %s
+
+! Combinational testing of control flow graph and builder insertion points
+! in mlir-to-llvm conversion:
+!   - mixing multiple delayed privatizations and multiple reductions
+!   - multiple blocks in the private alloc region
+!   - private alloc region has to read from the mold variable
+!   - firstprivate
+!   - multiple blocks in the private copy region
+!   - multiple blocks in the reduction init region
+!   - reduction init region has to read from the mold variable
+!   - re-used omp.private ops
+!   - re-used omp.reduction.declare ops
+!   - unstructured code inside of the parallel region
+!   - needs private dealloc region, and this has multiple blocks
+!   - needs reduction cleanup region, and this has multiple blocks
+
+! This maybe belongs in the mlir tests, but what we are doing here is complex
+! enough that I find the kind of minimised mlir code preferred by mlir reviewers
+! hard to read without some fortran here for reference. Nothing like this would
+! be generated by other upstream users of the MLIR OpenMP dialect.
+
+subroutine worst_case(a, b, c, d)
+  real, allocatable :: a(:), b(:), c(:), d(:)
+  integer i
+
+  !$omp parallel firstprivate(a,b) reduction(+:c,d)
+  if (sum(a) == 1) stop 1
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL: define internal void @worst_case_..omp_par
+! CHECK-NEXT:  omp.par.entry:
+!                [reduction alloc regions inlined here]
+! CHECK:         br label %omp.private.latealloc
+
+! CHECK:       omp.private.latealloc:                            ; preds = %omp.par.entry
+! CHECK-NEXT:  br label %omp.private.alloc5
+
+! CHECK:       omp.private.alloc5:                               ; preds = %omp.private.latealloc
+!                [begin private alloc for first var]
+!                [read the length from the mold argument]
+!                [if it is non-zero...]
+! CHECK:         br i1 {{.*}}, label %omp.private.alloc6, label %omp.private.alloc7
+
+! CHECK:       omp.private.alloc7:                               ; preds = %omp.private.alloc5
+!                [finish private alloc for first var with zero extent]
+! CHECK:         br label %omp.private.alloc8
+
+! CHECK:       omp.private.alloc8:                               ; preds = %omp.private.alloc6, %omp.private.alloc7
+! CHECK-NEXT:    br label %omp.region.cont4
+
+! CHECK:       omp.region.cont4:                                 ; preds = %omp.private.alloc8
+! CHECK-NEXT:    %{{.*}} = phi ptr
+! CHECK-NEXT:    br label %omp.private.alloc
+
+! CHECK:       omp.private.alloc:                                ; preds = %omp.region.cont4
+!                [begin private alloc for first var]
+!                [read the length from the mold argument]
+!                [if it is non-zero...]
+! CHECK:         br i1 %{{.*}}, label %omp.private.alloc1, label %omp.private.alloc2
+
+! CHECK:       omp.private.alloc2:                               ; preds = %omp.private.alloc
+!                [finish private alloc for second var with zero extent]
+! CHECK:         br label %omp.private.alloc3
+
+! CHECK:       omp.private.alloc3:                               ; preds = %omp.private.alloc1, %omp.private.alloc2
+! CHECK-NEXT:    br label %omp.region.cont
+
+! CHECK:       omp.region.cont:                                  ; preds = %omp.private.alloc3
+! CHECK-NEXT:    %{{.*}} = phi ptr
+! CHECK-NEXT:    br label %omp.private.copy
+
+! CHECK:       omp.private.copy:                                 ; preds = %omp.region.cont
+! CHECK-NEXT:    br label %omp.private.copy10
+
+! CHECK:       omp.private.copy10:                               ; preds = %omp.private.copy
+!                [begin firstprivate copy for first var]
+!                [read the length, is it non-zero?]
+! CHECK:         br i1 %{{.*}}, label %omp.private.copy11, label %omp.private.copy12
+
+! CHECK:       omp.private.copy12:                               ; preds = %omp.private.copy11, %omp.private.copy10
+! CHECK-NEXT:    br label %omp.region.cont9
+
+! CHECK:       omp.region.cont9:                                 ; preds = %omp.private.copy12
+! CHECK-NEXT:    %{{.*}} = phi ptr
+! CHECK-NEXT:    br label %omp.private.copy14
+
+! CHECK:       omp.private.copy14:                               ; preds = %omp.region.cont9
+!                [begin firstprivate copy for second var]
+!                [read the length, is it non-zero?]
+! CHECK:         br i1 %{{.*}}, label %omp.private.copy15, label %omp.private.copy16
+
+! CHECK:       omp.private.copy16:                               ; preds = %omp.private.copy15, %omp.private.copy14
+! CHECK-NEXT:    br label %omp.region.cont13
+
+! CHECK:       omp.region.cont13:                                ; preds = %omp.private.copy16
+! CHECK-NEXT:    %{{.*}} = phi ptr
+! CHECK-NEXT:    br label %omp.reduction.init
+
+! CHECK:       omp.reduction.init:                               ; preds = %omp.region.cont13
+!                [deffered stores for results of reduction alloc regions]
+! CHECK:         br label %[[VAL_96:.*]]
+
+! CHECK:       omp.reduction.neutral:                            ; preds = %omp.reduction.init
+!                [start of reduction initialization region]
+!                [null check:]
+! CHECK:         br i1 %{{.*}}, label %omp.reduction.neutral18, label %omp.reduction.neutral19
+
+! CHECK:       omp.reduction.neutral19:                          ; preds = %omp.reduction.neutral
+!                [malloc and assign the default value to the reduction variable]
+! CHECK:         br label %omp.reduction.neutral20
+
+! CHECK:       omp.reduction.neutral20:                          ; preds = %omp.reduction.neutral18, %omp.reduction.neutral19
+! CHECK-NEXT:    br label %omp.region.cont17
+
+! CHECK:       omp.region.cont17:                                ; preds = %omp.reduction.neutral20
+! CHECK-NEXT:    %{{.*}} = phi ptr
+! CHECK-NEXT:    br label %omp.reduction.neutral22
+
+! CHECK:       omp.reduction.neutral22:                          ; preds = %omp.region.cont17
+!                [start of reduction initialization region]
+!                [null check:]
+! CHECK:         br i1 %{{.*}}, label %omp.reduction.neutral23, label %omp.reduction.neutral24
+
+! CHECK:       omp.reduction.neutral24:                          ; preds = %omp.reduction.neutral22
+!                [malloc and assign the default value to the reduction variable]
+! CHECK:         br label %omp.reduction.neutral25
+
+! CHECK:       omp.reduction.neutral25:                          ; preds = %omp.reduction.neutral23, %omp.reduction.neutral24
+! CHECK-NEXT:    br label %omp.region.cont21
+
+! CHECK:       omp.region.cont21:                                ; preds = %omp.reduction.neutral25
+! CHECK-NEXT:    %{{.*}} = phi ptr
+! CHECK-NEXT:    br label %omp.par.region
+
+! CHECK:       omp.par.region:                                   ; preds = %omp.region.cont21
+! CHECK-NEXT:    br label %omp.par.region27
+
+! CHECK:       omp.par.region27:                                 ; preds = %omp.par.region
+!                [call SUM runtime function]
+!                [if (sum(a) == 1)]
+! CHECK:         br i1 %{{.*}}, label %omp.par.region28, label %omp.par.region29
+
+! CHECK:       omp.par.region29:                                 ; preds = %omp.par.region27
+! CHECK-NEXT:    br label %omp.region.cont26
+
+! CHECK:       omp.region.cont26:                                ; preds = %omp.par.region28, %omp.par.region29
+!                [omp parallel region done, call into the runtime to complete reduction]
+! CHECK:         %[[VAL_233:.*]] = call i32 @__kmpc_reduce(
+! CHECK:         switch i32 %[[VAL_233]], label %reduce.finalize [
+! CHECK-NEXT:      i32 1, label %reduce.switch.nonatomic
+! CHECK-NEXT:      i32 2, label %reduce.switch.atomic
+! CHECK-NEXT:    ]
+
+! CHECK:       reduce.switch.atomic:                             ; preds = %omp.region.cont26
+! CHECK-NEXT:    unreachable
+
+! CHECK:       reduce.switch.nonatomic:                          ; preds = %omp.region.cont26
+! CHECK-NEXT:    %[[red_private_value_0:.*]] = load ptr, ptr %{{.*}}, align 8
+! CHECK-NEXT:    br label %omp.reduction.nonatomic.body
+
+!              [various blocks implementing the reduction]
+
+! CHECK:       omp.region.cont35:                                ; preds =
+! CHECK-NEXT:    %{{.*}} = phi ptr
+! CHECK-NEXT:    call void @__kmpc_end_reduce(
+! CHECK-NEXT:    br label %reduce.finalize
+
+! CHECK:       reduce.finalize:                                  ; preds =
+! CHECK-NEXT:    br label %omp.par.pre_finalize
+
+! CHECK:       omp.par.pre_finalize:                             ; preds = %reduce.finalize
+! CHECK-NEXT:    %{{.*}} = load ptr, ptr
+! CHECK-NEXT:    br label %omp.reduction.cleanup
+
+! CHECK:       omp.reduction.cleanup:                            ; preds = %omp.par.pre_finalize
+!                [null check]
+! CHECK:         br i1 %{{.*}}, label %omp.reduction.cleanup41, label %omp.reduction.cleanup42
+
+! CHECK:       omp.reduction.cleanup42:                          ; preds = %omp.reduction.cleanup41, %omp.reduction.cleanup
+! CHECK-NEXT:    br label %omp.region.cont40
+
+! CHECK:       omp.region.cont40:                                ; preds = %omp.reduction.cleanup42
+! CHECK-NEXT:    %{{.*}} = load ptr, ptr
+! CHECK-NEXT:    br label %omp.reduction.cleanup44
+
+! CHECK:       omp.reduction.cleanup44:                          ; preds = %omp.region.cont40
+!                [null check]
+! CHECK:         br i1 %{{.*}}, label %omp.reduction.cleanup45, label %omp.reduction.cleanup46
+
+! CHECK:       omp.reduction.cleanup46:                          ; preds = %omp.reduction.cleanup45, %omp.reduction.cleanup44
+! CHECK-NEXT:    br label %omp.region.cont43
+
+! CHECK:       omp.region.cont43:                                ; preds = %omp.reduction.cleanup46
+! CHECK-NEXT:    br label %omp.private.dealloc
+
+! CHECK:       omp.private.dealloc:                              ; preds = %omp.region.cont43
+!                [null check]
+! CHECK:         br i1 %{{.*}}, label %omp.private.dealloc48, label %omp.private.dealloc49
+
+! CHECK:       omp.private.dealloc49:                            ; preds = %omp.private.dealloc48, %omp.private.dealloc
+! CHECK-NEXT:    br label %omp.region.cont47
+
+! CHECK:       omp.region.cont47:                                ; preds = %omp.private.dealloc49
+! CHECK-NEXT:    br label %omp.private.dealloc51
+
+! CHECK:       omp.private.dealloc51:                            ; preds = %omp.region.cont47
+!                [null check]
+! CHECK:         br i1 %{{.*}}, label %omp.private.dealloc52, label %omp.private.dealloc53
+
+! CHECK:       omp.private.dealloc53:                            ; preds = %omp.private.dealloc52, %omp.private.dealloc51
+! CHECK-NEXT:    br label %omp.region.cont50
+
+! CHECK:       omp.region.cont50:                                ; preds = %omp.private.dealloc53
+! CHECK-NEXT:    br label %omp.par.outlined.exit.exitStub
+
+! CHECK:       omp.private.dealloc52:                            ; preds = %omp.private.dealloc51
+!                [dealloc memory]
+! CHECK:         br label %omp.private.dealloc53
+
+! CHECK:       omp.private.dealloc48:                            ; preds = %omp.private.dealloc
+!                [dealloc memory]
+! CHECK:         br label %omp.private.dealloc49
+
+! CHECK:       omp.reduction.cleanup45:                          ; preds = %omp.reduction.cleanup44
+! CHECK-NEXT:    call void @free(
+! CHECK-NEXT:    br label %omp.reduction.cleanup46
+
+! CHECK:       omp.reduction.cleanup41:                          ; preds = %omp.reduction.cleanup
+! CHECK-NEXT:    call void @free(
+! CHECK-NEXT:    br label %omp.reduction.cleanup42
+
+! CHECK:       omp.par.region28:                                 ; preds = %omp.par.region27
+! CHECK-NEXT:    call {} @_FortranAStopStatement
+
+! CHECK:       omp.reduction.neutral23:                          ; preds = %omp.reduction.neutral22
+!                [source length was zero: finish initializing array]
+! CHECK:         br label %omp.reduction.neutral25
+
+! CHECK:       omp.reduction.neutral18:                          ; preds = %omp.reduction.neutral
+!                [source length was zero: finish initializing array]
+! CHECK:         br label %omp.reduction.neutral20
+
+! CHECK:       omp.private.copy15:                               ; preds = %omp.private.copy14
+!                [source length was non-zero: call assign runtime]
+! CHECK:         br label %omp.private.copy16
+
+! CHECK:       omp.private.copy11:                               ; preds = %omp.private.copy10
+!                [source length was non-zero: call assign runtime]
+! CHECK:         br label %omp.private.copy12
+
+! CHECK:       omp.private.alloc1:                               ; preds = %omp.private.alloc
+!                [var extent was non-zero: malloc a private array]
+! CHECK:         br label %omp.private.alloc3
+
+! CHECK:       omp.private.alloc6:                               ; preds = %omp.private.alloc5
+!                [var extent was non-zero: malloc a private array]
+! CHECK:         br label %omp.private.alloc8
+
+! CHECK:       omp.par.outlined.exit.exitStub:                   ; preds = %omp.region.cont50
+! CHECK-NEXT:    ret void
diff --git a/flang/test/Integration/OpenMP/private-global.f90 b/flang/test/Integration/OpenMP/private-global.f90
new file mode 100644
index 00000000000000..62d0a3faf0c593
--- /dev/null
+++ b/flang/test/Integration/OpenMP/private-global.f90
@@ -0,0 +1,46 @@
+!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s
+
+! Regression test for https://github.com/llvm/llvm-project/issues/106297
+
+program bug
+  implicit none
+  integer :: table(10)
+  !$OMP PARALLEL PRIVATE(table)
+    table = 50
+    if (any(table/=50)) then
+      stop 'fail 3'
+    end if
+  !$OMP END PARALLEL
+  print *,'ok'
+End Program
+
+
+! CHECK-LABEL: define internal void {{.*}}..omp_par(
+! CHECK:       omp.par.entry:
+! CHECK:         %[[VAL_9:.*]] = alloca i32, align 4
+! CHECK:         %[[VAL_10:.*]] = load i32, ptr %[[VAL_11:.*]], align 4
+! CHECK:         store i32 %[[VAL_10]], ptr %[[VAL_9]], align 4
+! CHECK:         %[[VAL_12:.*]] = load i32, ptr %[[VAL_9]], align 4
+! CHECK:         %[[PRIV_TABLE:.*]] = alloca [10 x i32], i64 1, align 4
+! ...
+! check that we use the private copy of table for the assignment
+! CHECK:       omp.par.region1:
+! CHECK:         %[[ELEMENTAL_TMP:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
+! CHECK:         %[[TABLE_BOX_ADDR:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
+! CHECK:         %[[BOXED_FIFTY:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+! CHECK:         %[[TABLE_BOX_ADDR2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
+! CHECK:         %[[TABLE_BOX_VAL:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]] {{\[\[}}3 x i64] [i64 1, i64 10, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64)]] }, ptr %[[PRIV_TABLE]], 0
+! CHECK:         store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL]], ptr %[[TABLE_BOX_ADDR]], align 8
+! CHECK:         %[[TABLE_BOX_VAL2:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[TABLE_BOX_ADDR]], align 8
+! CHECK:         store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL2]], ptr %[[TABLE_BOX_ADDR2]], align 8
+! CHECK:         %[[VAL_26:.*]] = call {} @_FortranAAssign(ptr %[[TABLE_BOX_ADDR2]], ptr %[[BOXED_FIFTY]], ptr @{{.*}}, i32 9)
+! ...
+! check that we use the private copy of table for table/=50
+! CHECK:       omp.par.region3:
+! CHECK:         %[[VAL_44:.*]] = sub nsw i64 %{{.*}}, 1
+! CHECK:         %[[VAL_45:.*]] = mul nsw i64 %[[VAL_44]], 1
+! CHECK:         %[[VAL_46:.*]] = mul nsw i64 %[[VAL_45]], 1
+! CHECK:         %[[VAL_47:.*]] = add nsw i64 %[[VAL_46]], 0
+! CHECK:         %[[VAL_48:.*]] = getelementptr i32, ptr %[[PRIV_TABLE]], i64 %[[VAL_47]]
+! CHECK:         %[[VAL_49:.*]] = load i32, ptr %[[VAL_48]], align 4
+! CHECK:         %[[VAL_50:.*]] = icmp ne i32 %[[VAL_49]], 50
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index cb7dd3cd874d9f..7c45e89cd8ac4b 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -371,20 +371,46 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder,
   return success();
 }
 
-/// Populates `reductions` with reduction declarations used in the given loop.
+/// Looks up and returns the `PrivateClauseOp` with the name `symbolName`,
+/// starting from the operation `from`.
+static omp::PrivateClauseOp findPrivatizer(Operation *from,
+                                           SymbolRefAttr symbolName) {
+  omp::PrivateClauseOp privatizer =
+      SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(from,
+                                                                 symbolName);
+  assert(privatizer && "privatizer not found in the symbol table");
+  return privatizer;
+}
+
+/// Populates `privatizations` with privatization declarations used for the
+/// given op.
+/// TODO: generalise beyond ParallelOp
+static void collectPrivatizationDecls(
+    omp::ParallelOp op, SmallVectorImpl<omp::PrivateClauseOp> &privatizations) {
+  std::optional<ArrayAttr> attr = op.getPrivateSyms();
+  if (!attr)
+    return;
+
+  privatizations.reserve(privatizations.size() + attr->size());
+  for (auto symbolRef : attr->getAsRange<SymbolRefAttr>()) {
+    privatizations.push_back(findPrivatizer(op, symbolRef));
+  }
+}
+
+/// Populates `reductions` with reduction declarations used in the given op.
 template <typename T>
 static void
-collectReductionDecls(T loop,
+collectReductionDecls(T op,
                       SmallVectorImpl<omp::DeclareReductionOp> &reductions) {
-  std::optional<ArrayAttr> attr = loop.getReductionSyms();
+  std::optional<ArrayAttr> attr = op.getReductionSyms();
   if (!attr)
     return;
 
-  reductions.reserve(reductions.size() + loop.getNumReductionVars());
+  reductions.reserve(reductions.size() + op.getNumReductionVars());
   for (auto symbolRef : attr->getAsRange<SymbolRefAttr>()) {
     reductions.push_back(
         SymbolTable::lookupNearestSymbolFrom<omp::DeclareReductionOp>(
-            loop, symbolRef));
+            op, symbolRef));
   }
 }
 
@@ -609,7 +635,7 @@ static LogicalResult
 allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
                    llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation,
-                   llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+                   const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
                    SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
                    SmallVectorImpl<llvm::Value *> &privateReductionVariables,
                    DenseMap<Value, llvm::Value *> &reductionVariableMap,
@@ -1317,76 +1343,11 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
                                     privateReductionVariables, isByRef);
 }
 
-/// A RAII class that on construction replaces the region arguments of the
-/// parallel op (which correspond to private variables) with the actual private
-/// variables they correspond to. This prepares the parallel op so that it
-/// matches what is expected by the OMPIRBuilder.
-///
-/// On destruction, it restores the original state of the operation so that on
-/// the MLIR side, the op is not affected by conversion to LLVM IR.
-class OmpParallelOpConversionManager {
-public:
-  OmpParallelOpConversionManager(omp::ParallelOp opInst)
-      : region(opInst.getRegion()),
-        privateBlockArgs(cast<omp::BlockArgOpenMPOpInterface>(*opInst)
-                             .getPrivateBlockArgs()),
-        privateVars(opInst.getPrivateVars()) {
-    for (auto [blockArg, var] : llvm::zip_equal(privateBlockArgs, privateVars))
-      mlir::replaceAllUsesInRegionWith(blockArg, var, region);
-  }
-
-  ~OmpParallelOpConversionManager() {
-    for (auto [blockArg, var] : llvm::zip_equal(privateBlockArgs, privateVars))
-      mlir::replaceAllUsesInRegionWith(var, blockArg, region);
-  }
-
-private:
-  Region &region;
-  llvm::MutableArrayRef<BlockArgument> privateBlockArgs;
-  OperandRange privateVars;
-};
-
-// Looks up from the operation from and returns the PrivateClauseOp with
-// name symbolName
-static omp::PrivateClauseOp findPrivatizer(Operation *from,
-                                           SymbolRefAttr symbolName) {
-  omp::PrivateClauseOp privatizer =
-      SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(from,
-                                                                 symbolName);
-  assert(privatizer && "privatizer not found in the symbol table");
-  return privatizer;
-}
-// clones the given privatizer. The original privatizer is used as
-// the insert point for the clone.
-static omp::PrivateClauseOp
-clonePrivatizer(LLVM::ModuleTranslation &moduleTranslation,
-                omp::PrivateClauseOp privatizer, Operation *fromOperation) {
-  MLIRContext &context = moduleTranslation.getContext();
-  mlir::IRRewriter opCloner(&context);
-  opCloner.setInsertionPoint(privatizer);
-  auto clone =
-      llvm::cast<mlir::omp::PrivateClauseOp>(opCloner.clone(*privatizer));
-
-  // Unique the clone name to avoid clashes in the symbol table.
-  unsigned counter = 0;
-  SmallString<256> cloneName = SymbolTable::generateSymbolName<256>(
-      privatizer.getSymName(),
-      [&](llvm::StringRef candidate) {
-        return SymbolTable::lookupNearestSymbolFrom(
-                   fromOperation, StringAttr::get(&context, candidate)) !=
-               nullptr;
-      },
-      counter);
-
-  clone.setSymName(cloneName);
-  return clone;
-}
 /// Converts the OpenMP parallel operation to LLVM IR.
 static LogicalResult
 convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation) {
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
-  OmpParallelOpConversionManager raii(opInst);
   ArrayRef<bool> isByRef = getIsByRef(opInst.getReductionByref());
   assert(isByRef.size() == opInst.getNumReductionVars());
 
@@ -1395,6 +1356,15 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
   LogicalResult bodyGenStatus = success();
   llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
 
+  // Collect delayed privatization declarations
+  MutableArrayRef<BlockArgument> privateBlockArgs =
+      cast<omp::BlockArgOpenMPOpInterface>(*opInst).getPrivateBlockArgs();
+  SmallVector<llvm::Value *> llvmPrivateVars;
+  SmallVector<omp::PrivateClauseOp> privateDecls;
+  llvmPrivateVars.reserve(privateBlockArgs.size());
+  privateDecls.reserve(privateBlockArgs.size());
+  collectPrivatizationDecls(opInst, privateDecls);
+
   // Collect reduction declarations
   SmallVector<omp::DeclareReductionOp> reductionDecls;
   collectReductionDecls(opInst, reductionDecls);
@@ -1403,6 +1373,66 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
   SmallVector<DeferredStore> deferredStores;
 
   auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
+    // Allocate private vars
+    llvm::BranchInst *allocaTerminator =
+        llvm::cast<llvm::BranchInst>(allocaIP.getBlock()->getTerminator());
+    builder.SetInsertPoint(allocaTerminator);
+    assert(allocaTerminator->getNumSuccessors() == 1 &&
+           "This is an unconditional branch created by OpenMPIRBuilder");
+    llvm::BasicBlock *afterAllocas = allocaTerminator->getSuccessor(0);
+
+    // FIXME: Some of the allocation regions do more than just allocating.
+    // They read from their block argument (amongst other non-alloca things).
+    // When OpenMPIRBuilder outlines the parallel region into a different
+    // function it places the loads for live in-values (such as these block
+    // arguments) at the end of the entry block (because the entry block is
+    // assumed to contain only allocas). Therefore, if we put these complicated
+    // alloc blocks in the entry block, these will not dominate the availability
+    // of the live-in values they are using. Fix this by adding a latealloc
+    // block after the entry block to put these in (this also helps to avoid
+    // mixing non-alloca code with allocas).
+    // Alloc regions which do not use the block argument can still be placed in
+    // the entry block (therefore keeping the allocas together).
+    llvm::BasicBlock *privAllocBlock = nullptr;
+    if (!privateBlockArgs.empty())
+      privAllocBlock = splitBB(builder, true, "omp.private.latealloc");
+    for (unsigned i = 0; i < privateBlockArgs.size(); ++i) {
+      Region &allocRegion = privateDecls[i].getAllocRegion();
+
+      // map allocation region block argument
+      llvm::Value *nonPrivateVar =
+          moduleTranslation.lookupValue(opInst.getPrivateVars()[i]);
+      assert(nonPrivateVar);
+      moduleTranslation.mapValue(privateDecls[i].getAllocMoldArg(),
+                                 nonPrivateVar);
+
+      // in-place convert the private allocation region
+      SmallVector<llvm::Value *, 1> phis;
+      if (privateDecls[i].getAllocMoldArg().getUses().empty()) {
+        // TODO this should use
+        // allocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca() so it goes before
+        // the code for fetching the thread id. Not doing this for now to avoid
+        // test churn.
+        builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
+      } else {
+        builder.SetInsertPoint(privAllocBlock->getTerminator());
+      }
+      if (failed(inlineConvertOmpRegions(allocRegion, "omp.private.alloc",
+                                         builder, moduleTranslation, &phis))) {
+        bodyGenStatus = failure();
+        return;
+      }
+      assert(phis.size() == 1 && "expected one allocation to be yielded");
+
+      moduleTranslation.mapValue(privateBlockArgs[i], phis[0]);
+      llvmPrivateVars.push_back(phis[0]);
+
+      // clear alloc region block argument mapping in case it needs to be
+      // re-created with a different source for another use of the same
+      // reduction decl
+      moduleTranslation.forgetMapping(allocRegion);
+    }
+
     // Allocate reduction vars
     DenseMap<Value, llvm::Value *> reductionVariableMap;
 
@@ -1419,12 +1449,64 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
             deferredStores, isByRef)))
       bodyGenStatus = failure();
 
+    // Apply copy region for firstprivate.
+    bool needsFirstprivate =
+        llvm::any_of(privateDecls, [](omp::PrivateClauseOp &privOp) {
+          return privOp.getDataSharingType() ==
+                 omp::DataSharingClauseType::FirstPrivate;
+        });
+    if (needsFirstprivate) {
+      // Find the end of the allocation blocks
+      assert(afterAllocas->getSinglePredecessor());
+      builder.SetInsertPoint(
+          afterAllocas->getSinglePredecessor()->getTerminator());
+      llvm::BasicBlock *copyBlock =
+          splitBB(builder, /*CreateBranch=*/true, "omp.private.copy");
+      builder.SetInsertPoint(copyBlock->getFirstNonPHIOrDbgOrAlloca());
+    }
+    for (unsigned i = 0; i < privateBlockArgs.size(); ++i) {
+      if (privateDecls[i].getDataSharingType() !=
+          omp::DataSharingClauseType::FirstPrivate)
+        continue;
+
+      // copyRegion implements `lhs = rhs`
+      Region &copyRegion = privateDecls[i].getCopyRegion();
+
+      // map copyRegion rhs arg
+      llvm::Value *nonPrivateVar =
+          moduleTranslation.lookupValue(opInst.getPrivateVars()[i]);
+      assert(nonPrivateVar);
+      moduleTranslation.mapValue(privateDecls[i].getCopyMoldArg(),
+                                 nonPrivateVar);
+
+      // map copyRegion lhs arg
+      moduleTranslation.mapValue(privateDecls[i].getCopyPrivateArg(),
+                                 llvmPrivateVars[i]);
+
+      // in-place convert copy region
+      builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
+      if (failed(inlineConvertOmpRegions(copyRegion, "omp.private.copy",
+                                         builder, moduleTranslation))) {
+        bodyGenStatus = failure();
+        return;
+      }
+
+      // ignore unused value yielded from copy region
+
+      // clear copy region block argument mapping in case it needs to be
+      // re-created with different sources for reuse of the same reduction
+      // decl
+      moduleTranslation.forgetMapping(copyRegion);
+    }
+
     // Initialize reduction vars
-    builder.restoreIP(allocaIP);
+    builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
     llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init");
     allocaIP =
         InsertPointTy(allocaIP.getBlock(),
                       allocaIP.getBlock()->getTerminator()->getIterator());
+
+    builder.restoreIP(allocaIP);
     SmallVector<llvm::Value *> byRefVars(opInst.getNumReductionVars());
     for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
       if (isByRef[i]) {
@@ -1534,183 +1616,11 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
     }
   };
 
-  SmallVector<omp::PrivateClauseOp> mlirPrivatizerClones;
-  SmallVector<llvm::Value *> llvmPrivateVars;
-
-  // TODO: Perform appropriate actions according to the data-sharing
-  // attribute (shared, private, firstprivate, ...) of variables.
-  // Currently shared and private are supported.
-  auto privCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP,
-                    llvm::Value &, llvm::Value &llvmOmpRegionInput,
-                    llvm::Value *&llvmReplacementValue) -> InsertPointTy {
-    llvmReplacementValue = &llvmOmpRegionInput;
-
-    // If this is a private value, this lambda will return the corresponding
-    // mlir value and its `PrivateClauseOp`. Otherwise, empty values are
-    // returned.
-    auto [mlirPrivVar, mlirPrivatizerClone] =
-        [&]() -> std::pair<mlir::Value, omp::PrivateClauseOp> {
-      if (!opInst.getPrivateVars().empty()) {
-        auto mlirPrivVars = opInst.getPrivateVars();
-        auto mlirPrivSyms = opInst.getPrivateSyms();
-
-        // Try to find a privatizer that corresponds to the LLVM value being
-        // privatized.
-        for (auto [mlirPrivVar, mlirPrivatizerAttr] :
-             llvm::zip_equal(mlirPrivVars, *mlirPrivSyms)) {
-          // Find the MLIR private variable corresponding to the LLVM value
-          // being privatized.
-          llvm::Value *mlirToLLVMPrivVar =
-              moduleTranslation.lookupValue(mlirPrivVar);
-
-          // Check if the LLVM value being privatized matches the LLVM value
-          // mapped to privVar. In some cases, this is not trivial ...
-          auto isMatch = [&]() {
-            if (mlirToLLVMPrivVar == nullptr)
-              return false;
-
-            // If both values are trivially equal, we found a match.
-            if (mlirToLLVMPrivVar == &llvmOmpRegionInput)
-              return true;
-
-            // Otherwise, we check if both llvmOmpRegionInputPtr and
-            // mlirToLLVMPrivVar refer to the same memory (through a load/store
-            // pair). This happens if a struct (i.e. multi-field value) is being
-            // privatized.
-            //
-            // For example, if the privatized value is defined by:
-            // ```
-            //   %priv_val = alloca { ptr, i64 }, align 8
-            // ```
-            //
-            // The initialization of this value (outside the omp region) will be
-            // something like this:
-            //
-            // clang-format off
-            // ```
-            //   %partially_init_priv_val = insertvalue { ptr, i64 } undef,
-            //                              ptr %some_ptr, 0
-            //   %fully_init_priv_val = insertvalue { ptr, i64 } %partially_init_priv_val,
-            //                          i64 %some_i64, 1
-            //   ...
-            //   store { ptr, i64 } %fully_init_priv_val, ptr %priv_val, align 8
-            // ```
-            // clang-format on
-            //
-            // Now, we enter the OMP region, in order to access this privatized
-            // value, we need to load from the allocated memory:
-            // ```
-            // omp.par.entry:
-            //   %priv_val_load = load { ptr, i64 }, ptr %priv_val, align 8
-            // ```
-            //
-            // The 2 LLVM values tracked here map as follows:
-            // - `mlirToLLVMPrivVar`     -> `%fully_init_priv_val`
-            // - `llvmOmpRegionInputPtr` -> `%priv_val_load`
-            //
-            // Even though they eventually refer to the same memory reference
-            // (the memory being privatized), they are 2 distinct LLVM values.
-            // Therefore, we need to discover their correspondence by finding
-            // out if they store into and load from the same mem ref.
-            auto *llvmOmpRegionInputPtrLoad =
-                llvm::dyn_cast_if_present<llvm::LoadInst>(&llvmOmpRegionInput);
-
-            if (llvmOmpRegionInputPtrLoad == nullptr)
-              return false;
-
-            for (auto &use : mlirToLLVMPrivVar->uses()) {
-              auto *mlirToLLVMPrivVarStore =
-                  llvm::dyn_cast_if_present<llvm::StoreInst>(use.getUser());
-              if (mlirToLLVMPrivVarStore &&
-                  (llvmOmpRegionInputPtrLoad->getPointerOperand() ==
-                   mlirToLLVMPrivVarStore->getPointerOperand()))
-                return true;
-            }
-
-            return false;
-          };
-
-          if (!isMatch())
-            continue;
-
-          SymbolRefAttr privSym = llvm::cast<SymbolRefAttr>(mlirPrivatizerAttr);
-          omp::PrivateClauseOp privatizer = findPrivatizer(opInst, privSym);
-
-          // Clone the privatizer in case it is used by more than one parallel
-          // region. The privatizer is processed in-place (see below) before it
-          // gets inlined in the parallel region and therefore processing the
-          // original op is dangerous.
-          return {mlirPrivVar,
-                  clonePrivatizer(moduleTranslation, privatizer, opInst)};
-        }
-      }
-
-      return {mlir::Value(), omp::PrivateClauseOp()};
-    }();
-
-    if (mlirPrivVar) {
-      Region &allocRegion = mlirPrivatizerClone.getAllocRegion();
-
-      // If this is a `firstprivate` clause, prepare the `omp.private` op by:
-      if (mlirPrivatizerClone.getDataSharingType() ==
-          omp::DataSharingClauseType::FirstPrivate) {
-        auto oldAllocBackBlock = std::prev(allocRegion.end());
-        omp::YieldOp oldAllocYieldOp =
-            llvm::cast<omp::YieldOp>(oldAllocBackBlock->getTerminator());
-
-        Region &copyRegion = mlirPrivatizerClone.getCopyRegion();
-
-        mlir::IRRewriter copyCloneBuilder(&moduleTranslation.getContext());
-        // 1. Cloning the `copy` region to the end of the `alloc` region.
-        copyCloneBuilder.cloneRegionBefore(copyRegion, allocRegion,
-                                           allocRegion.end());
-
-        auto newCopyRegionFrontBlock = std::next(oldAllocBackBlock);
-        // 2. Merging the last `alloc` block with the first block in the `copy`
-        // region clone.
-        // 3. Re-mapping the first argument of the `copy` region to be the
-        // argument of the `alloc` region and the second argument of the `copy`
-        // region to be the yielded value of the `alloc` region (this is the
-        // private clone of the privatized value).
-        copyCloneBuilder.mergeBlocks(&*newCopyRegionFrontBlock,
-                                     &*oldAllocBackBlock,
-                                     {mlirPrivatizerClone.getAllocMoldArg(),
-                                      oldAllocYieldOp.getOperand(0)});
-
-        // 4. The old terminator of the `alloc` region is not needed anymore, so
-        // delete it.
-        oldAllocYieldOp.erase();
-      }
-
-      // Replace the privatizer block argument with mlir value being privatized.
-      // This way, the body of the privatizer will be changed from using the
-      // region/block argument to the value being privatized.
-      replaceAllUsesInRegionWith(mlirPrivatizerClone.getAllocMoldArg(),
-                                 mlirPrivVar, allocRegion);
-
-      auto oldIP = builder.saveIP();
-      builder.restoreIP(allocaIP);
-
-      SmallVector<llvm::Value *, 1> yieldedValues;
-      if (failed(inlineConvertOmpRegions(allocRegion, "omp.privatizer", builder,
-                                         moduleTranslation, &yieldedValues))) {
-        opInst.emitError("failed to inline `alloc` region of an `omp.private` "
-                         "op in the parallel region");
-        bodyGenStatus = failure();
-        mlirPrivatizerClone.erase();
-      } else {
-        assert(yieldedValues.size() == 1);
-        llvmReplacementValue = yieldedValues.front();
-
-        // Keep the LLVM replacement value and the op clone in case we need to
-        // emit cleanup (i.e. deallocation) logic.
-        llvmPrivateVars.push_back(llvmReplacementValue);
-        mlirPrivatizerClones.push_back(mlirPrivatizerClone);
-      }
-
-      builder.restoreIP(oldIP);
-    }
-
+  auto privCB = [](InsertPointTy allocaIP, InsertPointTy codeGenIP,
+                   llvm::Value &, llvm::Value &val, llvm::Value *&replVal) {
+    // Tell OpenMPIRBuilder not to do anything; privatization was already
+    // handled in bodyGenCB.
+    replVal = &val;
     return codeGenIP;
   };
 
@@ -1733,8 +1643,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
       bodyGenStatus = failure();
 
     SmallVector<Region *> privateCleanupRegions;
-    llvm::transform(mlirPrivatizerClones,
-                    std::back_inserter(privateCleanupRegions),
+    llvm::transform(privateDecls, std::back_inserter(privateCleanupRegions),
                     [](omp::PrivateClauseOp privatizer) {
                       return &privatizer.getDeallocRegion();
                     });
@@ -1767,9 +1676,6 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
       ompBuilder->createParallel(ompLoc, allocaIP, bodyGenCB, privCB, finiCB,
                                  ifCond, numThreads, pbKind, isCancellable));
 
-  for (mlir::omp::PrivateClauseOp privatizerClone : mlirPrivatizerClones)
-    privatizerClone.erase();
-
   return bodyGenStatus;
 }
 
diff --git a/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir b/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir
index 02ce6b5b19ceaf..79412fb69f7583 100644
--- a/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir
@@ -74,27 +74,38 @@ llvm.func @parallel_op_firstprivate_multi_block(%arg0: !llvm.ptr) {
 // CHECK: [[PRIV_BB2]]:
 // CHECK-NEXT: %[[C1:.*]] = phi i32 [ 1, %[[PRIV_BB1]] ]
 // CHECK-NEXT: %[[PRIV_ALLOC:.*]] = alloca float, i32 %[[C1]], align 4
-// The entry block of the `copy` region is merged into the exit block of the
-// `alloc` region. So check for that.
+// CHECK-NEXT: br label %omp.region.cont
+
+// CHECK: omp.region.cont:
+// CHECK-NEXT: %[[PRIV_ALLOC2:.*]] = phi ptr [ %[[PRIV_ALLOC]], %[[PRIV_BB2]] ]
+// CHECK-NEXT: br label %omp.private.latealloc
+
+// CHECK: omp.private.latealloc:
+// CHECK-NEXT: br label %omp.private.copy
+
+// CHECK: omp.private.copy:
+// CHECK-NEXT: br label %omp.private.copy3
+
+// CHECK: omp.private.copy3:
 // CHECK-NEXT: %[[ORIG_VAL:.*]] = load float, ptr %[[ORIG_PTR]], align 4
 // CHECK-NEXT: br label %[[PRIV_BB3:.*]]
 
 // Check contents of the 2nd block in the `copy` region.
 // CHECK: [[PRIV_BB3]]:
-// CHECK-NEXT: %[[ORIG_VAL2:.*]] = phi float [ %[[ORIG_VAL]], %[[PRIV_BB2]] ]
-// CHECK-NEXT: %[[PRIV_ALLOC2:.*]] = phi ptr [ %[[PRIV_ALLOC]], %[[PRIV_BB2]] ]
-// CHECK-NEXT: store float %[[ORIG_VAL2]], ptr %[[PRIV_ALLOC2]], align 4
+// CHECK-NEXT: %[[ORIG_VAL2:.*]] = phi float [ %[[ORIG_VAL]], %omp.private.copy3 ]
+// CHECK-NEXT: %[[PRIV_ALLOC3:.*]] = phi ptr [ %[[PRIV_ALLOC2]], %omp.private.copy3 ]
+// CHECK-NEXT: store float %[[ORIG_VAL2]], ptr %[[PRIV_ALLOC3]], align 4
 // CHECK-NEXT: br label %[[PRIV_CONT:.*]]
 
 // Check that the privatizer's continuation block yields the private clone's
 // address.
 // CHECK: [[PRIV_CONT]]:
-// CHECK-NEXT:   %[[PRIV_ALLOC3:.*]] = phi ptr [ %[[PRIV_ALLOC2]], %[[PRIV_BB3]] ]
+// CHECK-NEXT:   %[[PRIV_ALLOC4:.*]] = phi ptr [ %[[PRIV_ALLOC3]], %[[PRIV_BB3]] ]
 // CHECK-NEXT:   br label %[[PAR_REG:.*]]
 
 // Check that the body of the parallel region loads from the private clone.
 // CHECK: [[PAR_REG]]:
-// CHECK:        %{{.*}} = load float, ptr %[[PRIV_ALLOC3]], align 4
+// CHECK:        %{{.*}} = load float, ptr %[[PRIV_ALLOC2]], align 4
 
 omp.private {type = firstprivate} @multi_block.privatizer : !llvm.ptr alloc {
 ^bb0(%arg0: !llvm.ptr):
diff --git a/mlir/test/Target/LLVMIR/openmp-private.mlir b/mlir/test/Target/LLVMIR/openmp-private.mlir
index 6153e5685c29fd..5407f97286eb1a 100644
--- a/mlir/test/Target/LLVMIR/openmp-private.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-private.mlir
@@ -104,6 +104,9 @@ llvm.func @parallel_op_private_multi_block(%arg0: !llvm.ptr) {
 // CHECK: omp.par.entry:
 // CHECK:  %[[ORIG_PTR_PTR:.*]] = getelementptr { ptr }, ptr %{{.*}}, i32 0, i32 0
 // CHECK:  %[[ORIG_PTR:.*]] = load ptr, ptr %[[ORIG_PTR_PTR]], align 8
+// CHECK:  br label %omp.private.latealloc
+
+// CHECK: omp.private.latealloc:
 // CHECK:   br label %[[PRIV_BB1:.*]]
 
 // Check contents of the first block in the `alloc` region.
@@ -151,8 +154,7 @@ omp.private {type = private} @multi_block.privatizer : !llvm.ptr alloc {
 // CHECK:         omp.par.region:
 // CHECK:           br label %[[PAR_REG_BEG:.*]]
 // CHECK:         [[PAR_REG_BEG]]:
-// CHECK:           %[[PRIVATIZER_GEP:.*]] = getelementptr double, ptr @_QQfoo, i64 111
-// CHECK:           call void @bar(ptr %[[PRIVATIZER_GEP]])
+// CHECK:           call void @bar(ptr getelementptr (double, ptr @_QQfoo, i64 111))
 // CHECK:           call void @bar(ptr getelementptr (double, ptr @_QQfoo, i64 222))
 llvm.func @lower_region_with_addressof() {
   %0 = llvm.mlir.constant(1 : i64) : i64

>From ab2b17512cda90305d5bea77b8e8fa119ab78f25 Mon Sep 17 00:00:00 2001
From: David Truby <david.truby at arm.com>
Date: Wed, 16 Oct 2024 14:48:59 +0100
Subject: [PATCH 69/82] [flang] Link to libatomic with openmp and rtlib=libgcc
 (#112202)

Currently when using OpenMP atomics we depend on some symbols from
libatomic. These symbols are provided in a separate library for the
libgcc runtime, so we should link to that when rtlib=libgcc.

For the compiler-rt case, the presence and location of the symbols is
dependent on how compiler-rt itself was built so we cannot make that
decision for the user. As such no extra flags are added in that case.
---
 clang/lib/Driver/ToolChains/CommonArgs.cpp | 10 ++++++++++
 flang/test/Driver/atomic.f90               |  5 +++++
 2 files changed, 15 insertions(+)
 create mode 100644 flang/test/Driver/atomic.f90

diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 3dd86ab7b99ca9..e662c3f0d2fa8f 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1294,6 +1294,16 @@ void tools::addFortranRuntimeLibs(const ToolChain &TC, const ArgList &Args,
     CmdArgs.push_back("-lFortranRuntime");
     CmdArgs.push_back("-lFortranDecimal");
   }
+
+  // libomp needs libatomic for atomic operations if using libgcc
+  if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
+                   options::OPT_fno_openmp, false)) {
+    Driver::OpenMPRuntimeKind OMPRuntime =
+        TC.getDriver().getOpenMPRuntime(Args);
+    ToolChain::RuntimeLibType RuntimeLib = TC.GetRuntimeLibType(Args);
+    if (OMPRuntime == Driver::OMPRT_OMP && RuntimeLib == ToolChain::RLT_Libgcc)
+      CmdArgs.push_back("-latomic");
+  }
 }
 
 void tools::addFortranRuntimeLibraryPath(const ToolChain &TC,
diff --git a/flang/test/Driver/atomic.f90 b/flang/test/Driver/atomic.f90
new file mode 100644
index 00000000000000..0fb3b428f694c1
--- /dev/null
+++ b/flang/test/Driver/atomic.f90
@@ -0,0 +1,5 @@
+!RUN: %flang --target=aarch64-unknown-linux-gnu -fuse-ld=ld -fopenmp -rtlib=libgcc -### %s 2>&1 | FileCheck --check-prefixes=GCC %s
+!RUN: %flang --target=aarch64-unknown-linux-gnu -fuse-ld=ld -fopenmp -rtlib=compiler-rt -### %s 2>&1 | FileCheck --check-prefixes=CRT %s
+
+!GCC: -latomic
+!CRT-NOT: -latomic

>From 91b5bef358e6763c5e18e34b1bc37e64114b3e04 Mon Sep 17 00:00:00 2001
From: David Truby <david.truby at arm.com>
Date: Wed, 16 Oct 2024 14:49:30 +0100
Subject: [PATCH 70/82] [flang] Tighten requirements on some glibc float128
 functions (#110651)

j0l, j1l, jnl, y0l, y1l and ynl are glibc extensions rather than
standard POSIX functions, and so are not available in every Linux libc.
This patch checks if `__GLIBC__` and `_GNU_SOURCE` are defined before using
these functions.

This patch allows the float128 runtime to build with musl libc on Linux.
---
 flang/lib/Evaluate/intrinsics-library.cpp | 2 +-
 flang/runtime/Float128Math/math-entries.h | 9 ++++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/flang/lib/Evaluate/intrinsics-library.cpp b/flang/lib/Evaluate/intrinsics-library.cpp
index ee4df2dbd11327..bb439a6bb3a746 100644
--- a/flang/lib/Evaluate/intrinsics-library.cpp
+++ b/flang/lib/Evaluate/intrinsics-library.cpp
@@ -417,7 +417,7 @@ template <> struct HostRuntimeLibrary<double, LibraryVersion::LibmExtensions> {
   static_assert(map.Verify(), "map must be sorted");
 };
 
-#if HAS_FLOAT80 || HAS_LDBL128
+#if defined(__GLIBC__) && (HAS_FLOAT80 || HAS_LDBL128)
 template <>
 struct HostRuntimeLibrary<long double, LibraryVersion::LibmExtensions> {
   using F = FuncPointer<long double, long double>;
diff --git a/flang/runtime/Float128Math/math-entries.h b/flang/runtime/Float128Math/math-entries.h
index 90a983b787f537..4600c726d72825 100644
--- a/flang/runtime/Float128Math/math-entries.h
+++ b/flang/runtime/Float128Math/math-entries.h
@@ -187,9 +187,6 @@ DEFINE_SIMPLE_ALIAS(Hypot, std::hypot)
 DEFINE_SIMPLE_ALIAS(Ilogb, std::ilogb)
 DEFINE_SIMPLE_ALIAS(Isinf, std::isinf)
 DEFINE_SIMPLE_ALIAS(Isnan, std::isnan)
-DEFINE_SIMPLE_ALIAS(J0, j0l)
-DEFINE_SIMPLE_ALIAS(J1, j1l)
-DEFINE_SIMPLE_ALIAS(Jn, jnl)
 DEFINE_SIMPLE_ALIAS(Ldexp, std::ldexp)
 DEFINE_SIMPLE_ALIAS(Lgamma, std::lgamma)
 DEFINE_SIMPLE_ALIAS(Llround, std::llround)
@@ -207,9 +204,15 @@ DEFINE_SIMPLE_ALIAS(Tan, std::tan)
 DEFINE_SIMPLE_ALIAS(Tanh, std::tanh)
 DEFINE_SIMPLE_ALIAS(Tgamma, std::tgamma)
 DEFINE_SIMPLE_ALIAS(Trunc, std::trunc)
+
+#if defined(__GLIBC__) && defined(_GNU_SOURCE)
+DEFINE_SIMPLE_ALIAS(J0, j0l)
+DEFINE_SIMPLE_ALIAS(J1, j1l)
+DEFINE_SIMPLE_ALIAS(Jn, jnl)
 DEFINE_SIMPLE_ALIAS(Y0, y0l)
 DEFINE_SIMPLE_ALIAS(Y1, y1l)
 DEFINE_SIMPLE_ALIAS(Yn, ynl)
+#endif
 
 // Use numeric_limits to produce infinity of the right type.
 #define F128_RT_INFINITY \

>From 87f126243beb69b8b02e5cd4df762bc8a6f1f8cc Mon Sep 17 00:00:00 2001
From: Dmitry Vasilyev <dvassiliev at accesssoftek.com>
Date: Wed, 16 Oct 2024 17:52:16 +0400
Subject: [PATCH 71/82] [lldb][test] Skip Test*FromStdModule tests on Linux for
 now (#112530)

This is the alternative to #98701.
See the following for more details:
https://reviews.llvm.org/D139361
https://discourse.llvm.org/t/lldb-test-failures-on-linux/80095
---
 .../expression/import-std-module/array/TestArrayFromStdModule.py | 1 +
 .../TestDbgInfoContentVectorFromStdModule.py                     | 1 +
 .../vector-of-vectors/TestVectorOfVectorsFromStdModule.py        | 1 +
 3 files changed, 3 insertions(+)

diff --git a/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py
index 13ab6b0c9ac1fb..bafc7628296217 100644
--- a/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py
@@ -10,6 +10,7 @@
 class TestCase(TestBase):
     @add_test_categories(["libc++"])
     @skipIf(compiler=no_match("clang"))
+    @skipIfLinux  # https://discourse.llvm.org/t/lldb-test-failures-on-linux/80095
     def test(self):
         self.build()
 
diff --git a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py
index 1c32222e64f14c..71eaeef20e792d 100644
--- a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py
@@ -14,6 +14,7 @@ class TestDbgInfoContentVector(TestBase):
     @skipIf(compiler="clang", compiler_version=["<", "12.0"])
     @skipIf(macos_version=["<", "14.0"])
     @skipIfDarwin  # https://github.com/llvm/llvm-project/issues/106475
+    @skipIfLinux  # https://discourse.llvm.org/t/lldb-test-failures-on-linux/80095
     def test(self):
         self.build()
 
diff --git a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py
index a1f33271f39d2f..e9415fd53651f7 100644
--- a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py
@@ -10,6 +10,7 @@
 class TestVectorOfVectors(TestBase):
     @add_test_categories(["libc++"])
     @skipIf(compiler=no_match("clang"))
+    @skipIfLinux  # https://discourse.llvm.org/t/lldb-test-failures-on-linux/80095
     def test(self):
         self.build()
 

>From b333edd0d6da744c099ad3ff3b5fbd2d4e4dd45a Mon Sep 17 00:00:00 2001
From: Boaz Brickner <brickner at google.com>
Date: Wed, 16 Oct 2024 16:02:25 +0200
Subject: [PATCH 72/82] [clang] When checking for covariant return types, make
 sure the pointers or references are to *classes* (#111856)

https://eel.is/c++draft/class.virtual#8.1

This prevents overriding methods with non-class return types that have
less cv-qualification.

Fixes: #111742
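
As a rough illustration of the rule (the names below are illustrative, not taken
from the patch): class-type covariance with a less cv-qualified pointee is still
accepted, while the same cv difference on a non-class pointee is now rejected,
matching the test updates below.

  struct S {};

  struct Base {
    virtual const int *f();  // pointee is not a class type
    virtual const S *g();    // pointee is a class type
  };

  struct Derived : Base {
    int *f() override;  // now an error: 'int' is not a class, so the
                        // cv-qualification difference is not covariance
    S *g() override;    // still fine: covariant class pointer with a less
                        // cv-qualified pointee
  };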
---
 clang/docs/ReleaseNotes.rst             | 13 +++++++++++++
 clang/lib/Sema/SemaDeclCXX.cpp          |  2 +-
 clang/test/SemaCXX/virtual-override.cpp |  6 ++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 33eb9a2b5804a0..dc5564b6db119f 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -99,6 +99,19 @@ C++ Specific Potentially Breaking Changes
     // Was error, now evaluates to false.
     constexpr bool b = f() == g();
 
+- Clang will now correctly not consider pointers to non-class types for covariance.
+
+  .. code-block:: c++
+
+    struct A {
+      virtual const int *f() const;
+    };
+    struct B : A {
+      // Return type has less cv-qualification but doesn't point to a class.
+      // Error will be generated.
+      int *f() const override;
+    };
+
 - The warning ``-Wdeprecated-literal-operator`` is now on by default, as this is
   something that WG21 has shown interest in removing from the language. The
   result is that anyone who is compiling with ``-Werror`` should see this
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 75d82c12e0c1f3..38f808a470aa87 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -18273,7 +18273,7 @@ bool Sema::CheckOverridingFunctionReturnType(const CXXMethodDecl *New,
   }
 
   // The return types aren't either both pointers or references to a class type.
-  if (NewClassTy.isNull()) {
+  if (NewClassTy.isNull() || !NewClassTy->isStructureOrClassType()) {
     Diag(New->getLocation(),
          diag::err_different_return_type_for_overriding_virtual_function)
         << New->getDeclName() << NewTy << OldTy
diff --git a/clang/test/SemaCXX/virtual-override.cpp b/clang/test/SemaCXX/virtual-override.cpp
index 72abfc3cf51e1f..d37c275d46baeb 100644
--- a/clang/test/SemaCXX/virtual-override.cpp
+++ b/clang/test/SemaCXX/virtual-override.cpp
@@ -19,10 +19,12 @@ struct b { };
   
 class A {
   virtual a* f(); // expected-note{{overridden virtual function is here}}
+  virtual int *g(); // expected-note{{overridden virtual function is here}}
 };
 
 class B : A {
   virtual b* f(); // expected-error{{return type of virtual function 'f' is not covariant with the return type of the function it overrides ('b *' is not derived from 'a *')}}
+  virtual char *g(); // expected-error{{virtual function 'g' has a different return type ('char *') than the function it overrides (which has return type 'int *')}}
 };
 
 }
@@ -83,11 +85,15 @@ struct a { };
 class A {
   virtual const a* f(); 
   virtual a* g(); // expected-note{{overridden virtual function is here}}
+  virtual const int* h(); // expected-note{{overridden virtual function is here}}
+  virtual int* i(); // expected-note{{overridden virtual function is here}}
 };
 
 class B : A {
   virtual a* f(); 
   virtual const a* g(); // expected-error{{return type of virtual function 'g' is not covariant with the return type of the function it overrides (class type 'const a *' is more qualified than class type 'a *'}}
+  virtual int* h();  // expected-error{{virtual function 'h' has a different return type ('int *') than the function it overrides (which has return type 'const int *')}}
+  virtual const int* i(); // expected-error{{virtual function 'i' has a different return type ('const int *') than the function it overrides (which has return type 'int *')}}
 };
 
 }

>From cba7b369b2a511082897fc5dc5a9c95a36c2743d Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Wed, 16 Oct 2024 07:20:18 -0700
Subject: [PATCH 73/82] [Clang][TableGen] Use const pointers for various Init
 objects in MveEmitter (#112320)

Use const pointers for various Init objects in MveEmitter. This is
part of an effort to have better const correctness in TableGen backends:


https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089
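
The change is mechanical: values read out of the TableGen DAGs are now traversed
through `const Init *` pointers. A minimal sketch of the resulting casting
pattern (the helper below is hypothetical, not part of the patch):

  #include "llvm/TableGen/Error.h"
  #include "llvm/TableGen/Record.h"

  using namespace llvm;

  // Hypothetical helper showing the const-correct pattern used throughout
  // MveEmitter after this change: DAG arguments are held as `const Init *`
  // and downcast with `const auto *`.
  static int64_t getIntArg(const DagInit *D, unsigned ArgNum) {
    const Init *Arg = D->getArg(ArgNum);
    if (const auto *II = dyn_cast<IntInit>(Arg))
      return II->getValue();
    PrintFatalError("expected an integer DAG argument");
  }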
---
 clang/utils/TableGen/MveEmitter.cpp | 51 +++++++++++++++--------------
 1 file changed, 26 insertions(+), 25 deletions(-)

diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp
index 915e914d6b9287..51e570944b49b6 100644
--- a/clang/utils/TableGen/MveEmitter.cpp
+++ b/clang/utils/TableGen/MveEmitter.cpp
@@ -1033,15 +1033,15 @@ class EmitterBase {
   // to expand Tablegen classes like 'Vector' which mean something different in
   // each member of a parametric family.
   const Type *getType(const Record *R, const Type *Param);
-  const Type *getType(DagInit *D, const Type *Param);
-  const Type *getType(Init *I, const Type *Param);
+  const Type *getType(const DagInit *D, const Type *Param);
+  const Type *getType(const Init *I, const Type *Param);
 
   // Functions that translate the Tablegen representation of an intrinsic's
   // code generation into a collection of Value objects (which will then be
   // reprocessed to read out the actual C++ code included by CGBuiltin.cpp).
-  Result::Ptr getCodeForDag(DagInit *D, const Result::Scope &Scope,
+  Result::Ptr getCodeForDag(const DagInit *D, const Result::Scope &Scope,
                             const Type *Param);
-  Result::Ptr getCodeForDagArg(DagInit *D, unsigned ArgNum,
+  Result::Ptr getCodeForDagArg(const DagInit *D, unsigned ArgNum,
                                const Result::Scope &Scope, const Type *Param);
   Result::Ptr getCodeForArg(unsigned ArgNum, const Type *ArgType, bool Promote,
                             bool Immediate);
@@ -1060,10 +1060,10 @@ class EmitterBase {
   void EmitBuiltinAliases(raw_ostream &OS);
 };
 
-const Type *EmitterBase::getType(Init *I, const Type *Param) {
-  if (auto Dag = dyn_cast<DagInit>(I))
+const Type *EmitterBase::getType(const Init *I, const Type *Param) {
+  if (const auto *Dag = dyn_cast<DagInit>(I))
     return getType(Dag, Param);
-  if (auto Def = dyn_cast<DefInit>(I))
+  if (const auto *Def = dyn_cast<DefInit>(I))
     return getType(Def->getDef(), Param);
 
   PrintFatalError("Could not convert this value into a type");
@@ -1088,7 +1088,7 @@ const Type *EmitterBase::getType(const Record *R, const Type *Param) {
   PrintFatalError(R->getLoc(), "Could not convert this record into a type");
 }
 
-const Type *EmitterBase::getType(DagInit *D, const Type *Param) {
+const Type *EmitterBase::getType(const DagInit *D, const Type *Param) {
   // The meat of the getType system: types in the Tablegen are represented by a
   // dag whose operators select sub-cases of this function.
 
@@ -1156,7 +1156,8 @@ const Type *EmitterBase::getType(DagInit *D, const Type *Param) {
   PrintFatalError("Bad operator in type dag expression");
 }
 
-Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope,
+Result::Ptr EmitterBase::getCodeForDag(const DagInit *D,
+                                       const Result::Scope &Scope,
                                        const Type *Param) {
   const Record *Op = cast<DefInit>(D->getOperator())->getDef();
 
@@ -1199,14 +1200,14 @@ Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope,
     Result::Ptr Arg = getCodeForDagArg(D, 0, Scope, Param);
 
     const Type *Ty = nullptr;
-    if (auto *DI = dyn_cast<DagInit>(D->getArg(0)))
+    if (const auto *DI = dyn_cast<DagInit>(D->getArg(0)))
       if (auto *PTy = dyn_cast<PointerType>(getType(DI->getOperator(), Param)))
         Ty = PTy->getPointeeType();
     if (!Ty)
       PrintFatalError("'address' pointer argument should be a pointer");
 
     unsigned Alignment;
-    if (auto *II = dyn_cast<IntInit>(D->getArg(1))) {
+    if (const auto *II = dyn_cast<IntInit>(D->getArg(1))) {
       Alignment = II->getValue();
     } else {
       PrintFatalError("'address' alignment argument should be an integer");
@@ -1267,10 +1268,10 @@ Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope,
   }
 }
 
-Result::Ptr EmitterBase::getCodeForDagArg(DagInit *D, unsigned ArgNum,
+Result::Ptr EmitterBase::getCodeForDagArg(const DagInit *D, unsigned ArgNum,
                                           const Result::Scope &Scope,
                                           const Type *Param) {
-  Init *Arg = D->getArg(ArgNum);
+  const Init *Arg = D->getArg(ArgNum);
   StringRef Name = D->getArgNameStr(ArgNum);
 
   if (!Name.empty()) {
@@ -1286,18 +1287,18 @@ Result::Ptr EmitterBase::getCodeForDagArg(DagInit *D, unsigned ArgNum,
   // Sometimes the Arg is a bit. Prior to multiclass template argument
   // checking, integers would sneak through the bit declaration,
   // but now they really are bits.
-  if (auto *BI = dyn_cast<BitInit>(Arg))
+  if (const auto *BI = dyn_cast<BitInit>(Arg))
     return std::make_shared<IntLiteralResult>(getScalarType("u32"),
                                               BI->getValue());
 
-  if (auto *II = dyn_cast<IntInit>(Arg))
+  if (const auto *II = dyn_cast<IntInit>(Arg))
     return std::make_shared<IntLiteralResult>(getScalarType("u32"),
                                               II->getValue());
 
-  if (auto *DI = dyn_cast<DagInit>(Arg))
+  if (const auto *DI = dyn_cast<DagInit>(Arg))
     return getCodeForDag(DI, Scope, Param);
 
-  if (auto *DI = dyn_cast<DefInit>(Arg)) {
+  if (const auto *DI = dyn_cast<DefInit>(Arg)) {
     const Record *Rec = DI->getDef();
     if (Rec->isSubClassOf("Type")) {
       const Type *T = getType(Rec, Param);
@@ -1307,7 +1308,7 @@ Result::Ptr EmitterBase::getCodeForDagArg(DagInit *D, unsigned ArgNum,
 
   PrintError("bad DAG argument type for code generation");
   PrintNote("DAG: " + D->getAsString());
-  if (TypedInit *Typed = dyn_cast<TypedInit>(Arg))
+  if (const auto *Typed = dyn_cast<TypedInit>(Arg))
     PrintNote("argument type: " + Typed->getType()->getAsString());
   PrintFatalNote("argument number " + Twine(ArgNum) + ": " + Arg->getAsString());
 }
@@ -1379,13 +1380,13 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R,
   HeaderOnly = R->getValueAsBit("headerOnly");
 
   // Process the intrinsic's argument list.
-  DagInit *ArgsDag = R->getValueAsDag("args");
+  const DagInit *ArgsDag = R->getValueAsDag("args");
   Result::Scope Scope;
   for (unsigned i = 0, e = ArgsDag->getNumArgs(); i < e; ++i) {
-    Init *TypeInit = ArgsDag->getArg(i);
+    const Init *TypeInit = ArgsDag->getArg(i);
 
     bool Promote = true;
-    if (auto TypeDI = dyn_cast<DefInit>(TypeInit))
+    if (const auto *TypeDI = dyn_cast<DefInit>(TypeInit))
       if (TypeDI->getDef()->isSubClassOf("unpromoted"))
         Promote = false;
 
@@ -1397,7 +1398,7 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R,
     // If the argument is a subclass of Immediate, record the details about
     // what values it can take, for Sema checking.
     bool Immediate = false;
-    if (auto TypeDI = dyn_cast<DefInit>(TypeInit)) {
+    if (const auto *TypeDI = dyn_cast<DefInit>(TypeInit)) {
       const Record *TypeRec = TypeDI->getDef();
       if (TypeRec->isSubClassOf("Immediate")) {
         Immediate = true;
@@ -1444,7 +1445,7 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R,
 
   // Finally, go through the codegen dag and translate it into a Result object
   // (with an arbitrary DAG of depended-on Results hanging off it).
-  DagInit *CodeDag = R->getValueAsDag("codegen");
+  const DagInit *CodeDag = R->getValueAsDag("codegen");
   const Record *MainOp = cast<DefInit>(CodeDag->getOperator())->getDef();
   if (MainOp->isSubClassOf("CustomCodegen")) {
     // Or, if it's the special case of CustomCodegen, just accumulate
@@ -1456,9 +1457,9 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R,
       StringRef Name = CodeDag->getArgNameStr(i);
       if (Name.empty()) {
         PrintFatalError("Operands to CustomCodegen should have names");
-      } else if (auto *II = dyn_cast<IntInit>(CodeDag->getArg(i))) {
+      } else if (const auto *II = dyn_cast<IntInit>(CodeDag->getArg(i))) {
         CustomCodeGenArgs[std::string(Name)] = itostr(II->getValue());
-      } else if (auto *SI = dyn_cast<StringInit>(CodeDag->getArg(i))) {
+      } else if (const auto *SI = dyn_cast<StringInit>(CodeDag->getArg(i))) {
         CustomCodeGenArgs[std::string(Name)] = std::string(SI->getValue());
       } else {
         PrintFatalError("Operands to CustomCodegen should be integers");

>From 6924fc03260370876f7091ba06cdc350989ac3c5 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Wed, 16 Oct 2024 07:21:10 -0700
Subject: [PATCH 74/82] [LLVM] Add `Intrinsic::getDeclarationIfExists`
 (#112428)

Add `Intrinsic::getDeclarationIfExists` to look up an existing
declaration of an intrinsic in a `Module`.
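
A minimal usage sketch, mirroring the call sites updated in this patch (the
wrapper function below is made up for illustration): unlike
`getOrInsertDeclaration`, the lookup returns nullptr instead of creating a
declaration.

  #include "llvm/IR/Function.h"
  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/Module.h"

  using namespace llvm;

  // Returns true if the module already declares llvm.experimental.guard and
  // that declaration has users. Nothing is inserted into the module.
  static bool moduleUsesGuards(const Module &M) {
    Function *GuardDecl =
        Intrinsic::getDeclarationIfExists(&M, Intrinsic::experimental_guard);
    return GuardDecl && !GuardDecl->use_empty();
  }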
---
 llvm/include/llvm/IR/Intrinsics.h                  | 10 ++++++++++
 llvm/lib/Analysis/LazyValueInfo.cpp                |  2 +-
 llvm/lib/Analysis/ScalarEvolution.cpp              | 12 ++++++------
 llvm/lib/IR/Intrinsics.cpp                         | 10 ++++++++++
 llvm/lib/LTO/LTO.cpp                               | 14 +++++++-------
 .../Target/AMDGPU/AMDGPULowerKernelArguments.cpp   |  4 ++--
 .../Target/AMDGPU/AMDGPULowerKernelAttributes.cpp  |  3 +--
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp          |  2 +-
 llvm/lib/Transforms/IPO/ExpandVariadics.cpp        |  5 +++--
 llvm/lib/Transforms/IPO/GlobalDCE.cpp              |  6 +++---
 llvm/lib/Transforms/IPO/GlobalSplit.cpp            |  8 ++++----
 llvm/lib/Transforms/IPO/LowerTypeTests.cpp         |  6 +++---
 llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp   | 10 +++++-----
 llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp     | 13 +++++++------
 .../Instrumentation/IndirectCallPromotion.cpp      |  2 +-
 .../Transforms/Instrumentation/InstrProfiling.cpp  | 12 ++++++------
 llvm/lib/Transforms/Scalar/GuardWidening.cpp       |  8 ++++----
 llvm/lib/Transforms/Scalar/IndVarSimplify.cpp      |  4 ++--
 llvm/lib/Transforms/Scalar/JumpThreading.cpp       |  4 ++--
 llvm/lib/Transforms/Scalar/LoopPredication.cpp     |  6 +++---
 llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp |  4 ++--
 .../Transforms/Scalar/LowerWidenableCondition.cpp  |  4 ++--
 llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp  |  4 ++--
 llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp  |  4 ++--
 24 files changed, 89 insertions(+), 68 deletions(-)

diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 49f4fe4c5c3d7f..e893295e3272b9 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -102,6 +102,16 @@ namespace Intrinsic {
   inline Function *getDeclaration(Module *M, ID id, ArrayRef<Type *> Tys = {}) {
     return getOrInsertDeclaration(M, id, Tys);
   }
+
+  /// Look up the Function declaration of the intrinsic \p id in the Module
+  /// \p M and return it if it exists. Otherwise, return nullptr. This version
+  /// supports non-overloaded intrinsics.
+  Function *getDeclarationIfExists(const Module *M, ID id);
+
+  /// This version supports overloaded intrinsics.
+  Function *getDeclarationIfExists(Module *M, ID id, ArrayRef<Type *> Tys,
+                                   FunctionType *FT = nullptr);
+
   /// Looks up Name in NameTable via binary search. NameTable must be sorted
   /// and all entries must start with "llvm.".  If NameTable contains an exact
   /// match for Name or a prefix of Name followed by a dot, its index in
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index 30dc4ae30dbfa5..10ad4708596cb3 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -1613,7 +1613,7 @@ LazyValueInfoImpl &LazyValueInfo::getOrCreateImpl(const Module *M) {
     assert(M && "getCache() called with a null Module");
     const DataLayout &DL = M->getDataLayout();
     Function *GuardDecl =
-        M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard));
+        Intrinsic::getDeclarationIfExists(M, Intrinsic::experimental_guard);
     PImpl = new LazyValueInfoImpl(AC, DL, GuardDecl);
   }
   return *static_cast<LazyValueInfoImpl *>(PImpl);
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 97ea405a5267ae..a3ba8e03781910 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -11665,8 +11665,8 @@ bool ScalarEvolution::isBasicBlockEntryGuardedByCond(const BasicBlock *BB,
   }
 
   // Check conditions due to any @llvm.experimental.guard intrinsics.
-  auto *GuardDecl = F.getParent()->getFunction(
-      Intrinsic::getName(Intrinsic::experimental_guard));
+  auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+      F.getParent(), Intrinsic::experimental_guard);
   if (GuardDecl)
     for (const auto *GU : GuardDecl->users())
       if (const auto *Guard = dyn_cast<IntrinsicInst>(GU))
@@ -13615,8 +13615,8 @@ ScalarEvolution::ScalarEvolution(Function &F, TargetLibraryInfo &TLI,
   // ScalarEvolution to optimize based on those guards.  For now we prefer to be
   // efficient in lieu of being smart in that rather obscure case.
 
-  auto *GuardDecl = F.getParent()->getFunction(
-      Intrinsic::getName(Intrinsic::experimental_guard));
+  auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+      F.getParent(), Intrinsic::experimental_guard);
   HasGuards = GuardDecl && !GuardDecl->use_empty();
 }
 
@@ -15593,8 +15593,8 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
   }
 
   // Second, collect information from llvm.experimental.guards dominating the loop.
-  auto *GuardDecl = SE.F.getParent()->getFunction(
-      Intrinsic::getName(Intrinsic::experimental_guard));
+  auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+      SE.F.getParent(), Intrinsic::experimental_guard);
   if (GuardDecl)
     for (const auto *GU : GuardDecl->users())
       if (const auto *Guard = dyn_cast<IntrinsicInst>(GU))
diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp
index ff8b4b7a020c2f..1b92daf15b463e 100644
--- a/llvm/lib/IR/Intrinsics.cpp
+++ b/llvm/lib/IR/Intrinsics.cpp
@@ -724,6 +724,16 @@ Function *Intrinsic::getOrInsertDeclaration(Module *M, ID id,
           .getCallee());
 }
 
+Function *Intrinsic::getDeclarationIfExists(const Module *M, ID id) {
+  return M->getFunction(getName(id));
+}
+
+Function *Intrinsic::getDeclarationIfExists(Module *M, ID id,
+                                            ArrayRef<Type *> Tys,
+                                            FunctionType *FT) {
+  return M->getFunction(getName(id, Tys, M, FT));
+}
+
 // This defines the "Intrinsic::getIntrinsicForClangBuiltin()" method.
 #define GET_LLVM_INTRINSIC_FOR_CLANG_BUILTIN
 #include "llvm/IR/IntrinsicImpl.inc"
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 90c4e2c3cd131c..0f53c608512171 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1120,13 +1120,13 @@ Error LTO::checkPartiallySplit() {
   if (!ThinLTO.CombinedIndex.partiallySplitLTOUnits())
     return Error::success();
 
-  Function *TypeTestFunc = RegularLTO.CombinedModule->getFunction(
-      Intrinsic::getName(Intrinsic::type_test));
-  Function *TypeCheckedLoadFunc = RegularLTO.CombinedModule->getFunction(
-      Intrinsic::getName(Intrinsic::type_checked_load));
-  Function *TypeCheckedLoadRelativeFunc =
-      RegularLTO.CombinedModule->getFunction(
-          Intrinsic::getName(Intrinsic::type_checked_load_relative));
+  const Module *Combined = RegularLTO.CombinedModule.get();
+  Function *TypeTestFunc =
+      Intrinsic::getDeclarationIfExists(Combined, Intrinsic::type_test);
+  Function *TypeCheckedLoadFunc =
+      Intrinsic::getDeclarationIfExists(Combined, Intrinsic::type_checked_load);
+  Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists(
+      Combined, Intrinsic::type_checked_load_relative);
 
   // First check if there are type tests / type checked loads in the
   // merged regular LTO module IR.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index d16c96f88e7b1e..6573176492b7f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -171,8 +171,8 @@ class PreloadKernelArgInfo {
   // Try to allocate SGPRs to preload implicit kernel arguments.
   void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
                                        IRBuilder<> &Builder) {
-    StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
-    Function *ImplicitArgPtr = F.getParent()->getFunction(Name);
+    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
+        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
     if (!ImplicitArgPtr)
       return;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 7d66d07c9d0fb7..1bb5e794da7dd6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -78,8 +78,7 @@ class AMDGPULowerKernelAttributes : public ModulePass {
 Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
   auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
                                  : Intrinsic::amdgcn_dispatch_ptr;
-  StringRef Name = Intrinsic::getName(IntrinsicId);
-  return M.getFunction(Name);
+  return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);
 }
 
 } // end anonymous namespace
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8c197f23149612..de9173e923ab5c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8786,7 +8786,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
 
     const Module *M = MF.getFunction().getParent();
     const GlobalValue *GV =
-        M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
+        Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
                                             SIInstrInfo::MO_ABS32_LO);
     return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
index a7a01ca1055dd3..3121659edadd81 100644
--- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -145,9 +145,10 @@ class VariadicABIInfo {
 // function here in the meantime to decouple from that discussion.
 Function *getPreexistingDeclaration(Module *M, Intrinsic::ID Id,
                                     ArrayRef<Type *> Tys = {}) {
+  if (Tys.empty())
+    return Intrinsic::getDeclarationIfExists(M, Id);
   auto *FT = Intrinsic::getType(M->getContext(), Id, Tys);
-  return M->getFunction(Tys.empty() ? Intrinsic::getName(Id)
-                                    : Intrinsic::getName(Id, Tys, M, FT));
+  return Intrinsic::getDeclarationIfExists(M, Id, Tys, FT);
 }
 
 class ExpandVariadics : public ModulePass {
diff --git a/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/llvm/lib/Transforms/IPO/GlobalDCE.cpp
index e36d524d7667ab..eca36fb31cea0e 100644
--- a/llvm/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalDCE.cpp
@@ -186,9 +186,9 @@ void GlobalDCEPass::ScanVTableLoad(Function *Caller, Metadata *TypeId,
 void GlobalDCEPass::ScanTypeCheckedLoadIntrinsics(Module &M) {
   LLVM_DEBUG(dbgs() << "Scanning type.checked.load intrinsics\n");
   Function *TypeCheckedLoadFunc =
-      M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
-  Function *TypeCheckedLoadRelativeFunc =
-      M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load_relative));
+      Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_checked_load);
+  Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists(
+      &M, Intrinsic::type_checked_load_relative);
 
   auto scan = [&](Function *CheckedLoadFunc) {
     if (!CheckedLoadFunc)
diff --git a/llvm/lib/Transforms/IPO/GlobalSplit.cpp b/llvm/lib/Transforms/IPO/GlobalSplit.cpp
index fd49b745fd750e..320fd893935f84 100644
--- a/llvm/lib/Transforms/IPO/GlobalSplit.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalSplit.cpp
@@ -174,11 +174,11 @@ static bool splitGlobals(Module &M) {
   // llvm.type.checked.load intrinsics, which indicates that splitting globals
   // may be beneficial.
   Function *TypeTestFunc =
-      M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+      Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test);
   Function *TypeCheckedLoadFunc =
-      M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
-  Function *TypeCheckedLoadRelativeFunc =
-      M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load_relative));
+      Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_checked_load);
+  Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists(
+      &M, Intrinsic::type_checked_load_relative);
   if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
       (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()) &&
       (!TypeCheckedLoadRelativeFunc ||
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 519a4e9314a26b..3fcfc6a876776d 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -1970,7 +1970,7 @@ static void dropTypeTests(Module &M, Function &TypeTestFunc) {
 
 bool LowerTypeTestsModule::lower() {
   Function *TypeTestFunc =
-      M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+      Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test);
 
   if (DropTypeTests) {
     if (TypeTestFunc)
@@ -1979,7 +1979,7 @@ bool LowerTypeTestsModule::lower() {
     // except for in the case where we originally were performing ThinLTO but
     // decided not to in the backend.
     Function *PublicTypeTestFunc =
-        M.getFunction(Intrinsic::getName(Intrinsic::public_type_test));
+        Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test);
     if (PublicTypeTestFunc)
       dropTypeTests(M, *PublicTypeTestFunc);
     if (TypeTestFunc || PublicTypeTestFunc) {
@@ -2002,7 +2002,7 @@ bool LowerTypeTestsModule::lower() {
     return false;
 
   Function *ICallBranchFunnelFunc =
-      M.getFunction(Intrinsic::getName(Intrinsic::icall_branch_funnel));
+      Intrinsic::getDeclarationIfExists(&M, Intrinsic::icall_branch_funnel);
   if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
       (!ICallBranchFunnelFunc || ICallBranchFunnelFunc->use_empty()) &&
       !ExportSummary && !ImportSummary)
diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 9bf29c46938eba..cd0e412bdf353b 100644
--- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -123,7 +123,7 @@ void promoteTypeIds(Module &M, StringRef ModuleId) {
   };
 
   if (Function *TypeTestFunc =
-          M.getFunction(Intrinsic::getName(Intrinsic::type_test))) {
+          Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test)) {
     for (const Use &U : TypeTestFunc->uses()) {
       auto CI = cast<CallInst>(U.getUser());
       ExternalizeTypeId(CI, 1);
@@ -131,7 +131,7 @@ void promoteTypeIds(Module &M, StringRef ModuleId) {
   }
 
   if (Function *PublicTypeTestFunc =
-          M.getFunction(Intrinsic::getName(Intrinsic::public_type_test))) {
+          Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test)) {
     for (const Use &U : PublicTypeTestFunc->uses()) {
       auto CI = cast<CallInst>(U.getUser());
       ExternalizeTypeId(CI, 1);
@@ -139,15 +139,15 @@ void promoteTypeIds(Module &M, StringRef ModuleId) {
   }
 
   if (Function *TypeCheckedLoadFunc =
-          M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load))) {
+          Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_checked_load)) {
     for (const Use &U : TypeCheckedLoadFunc->uses()) {
       auto CI = cast<CallInst>(U.getUser());
       ExternalizeTypeId(CI, 2);
     }
   }
 
-  if (Function *TypeCheckedLoadRelativeFunc = M.getFunction(
-          Intrinsic::getName(Intrinsic::type_checked_load_relative))) {
+  if (Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists(
+          &M, Intrinsic::type_checked_load_relative)) {
     for (const Use &U : TypeCheckedLoadRelativeFunc->uses()) {
       auto CI = cast<CallInst>(U.getUser());
       ExternalizeTypeId(CI, 2);
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 59f986b4ca2664..45d32218f362c5 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -851,7 +851,7 @@ void llvm::updateVCallVisibilityInModule(
 void llvm::updatePublicTypeTestCalls(Module &M,
                                      bool WholeProgramVisibilityEnabledInLTO) {
   Function *PublicTypeTestFunc =
-      M.getFunction(Intrinsic::getName(Intrinsic::public_type_test));
+      Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test);
   if (!PublicTypeTestFunc)
     return;
   if (hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) {
@@ -2247,12 +2247,13 @@ bool DevirtModule::run() {
     return false;
 
   Function *TypeTestFunc =
-      M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+      Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test);
   Function *TypeCheckedLoadFunc =
-      M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
-  Function *TypeCheckedLoadRelativeFunc =
-      M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load_relative));
-  Function *AssumeFunc = M.getFunction(Intrinsic::getName(Intrinsic::assume));
+      Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_checked_load);
+  Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists(
+      &M, Intrinsic::type_checked_load_relative);
+  Function *AssumeFunc =
+      Intrinsic::getDeclarationIfExists(&M, Intrinsic::assume);
 
   // Normally if there are no users of the devirtualization intrinsics in the
   // module, this pass has nothing to do. But if we are exporting, we also need
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 86637109d94083..43b8d5e6a8ce5b 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -944,7 +944,7 @@ computeVirtualCallSiteTypeInfoMap(Module &M, ModuleAnalysisManager &MAM,
   // Find out virtual calls by looking at users of llvm.type.checked.load in
   // that case.
   Function *TypeTestFunc =
-      M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+      Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test);
   if (!TypeTestFunc || TypeTestFunc->use_empty())
     return;
 
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 929c787442057a..d7d809dfdd5f65 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -902,15 +902,15 @@ static bool needsRuntimeHookUnconditionally(const Triple &TT) {
 /// Check if the module contains uses of any profiling intrinsics.
 static bool containsProfilingIntrinsics(Module &M) {
   auto containsIntrinsic = [&](int ID) {
-    if (auto *F = M.getFunction(Intrinsic::getName(ID)))
+    if (auto *F = Intrinsic::getDeclarationIfExists(&M, ID))
       return !F->use_empty();
     return false;
   };
-  return containsIntrinsic(llvm::Intrinsic::instrprof_cover) ||
-         containsIntrinsic(llvm::Intrinsic::instrprof_increment) ||
-         containsIntrinsic(llvm::Intrinsic::instrprof_increment_step) ||
-         containsIntrinsic(llvm::Intrinsic::instrprof_timestamp) ||
-         containsIntrinsic(llvm::Intrinsic::instrprof_value_profile);
+  return containsIntrinsic(Intrinsic::instrprof_cover) ||
+         containsIntrinsic(Intrinsic::instrprof_increment) ||
+         containsIntrinsic(Intrinsic::instrprof_increment_step) ||
+         containsIntrinsic(Intrinsic::instrprof_timestamp) ||
+         containsIntrinsic(Intrinsic::instrprof_value_profile);
 }
 
 bool InstrLowerer::lower() {
diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
index e7ff2a14469c5e..7fa9f42809091f 100644
--- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -980,11 +980,11 @@ StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
 PreservedAnalyses GuardWideningPass::run(Function &F,
                                          FunctionAnalysisManager &AM) {
   // Avoid requesting analyses if there are no guards or widenable conditions.
-  auto *GuardDecl = F.getParent()->getFunction(
-      Intrinsic::getName(Intrinsic::experimental_guard));
+  auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+      F.getParent(), Intrinsic::experimental_guard);
   bool HasIntrinsicGuards = GuardDecl && !GuardDecl->use_empty();
-  auto *WCDecl = F.getParent()->getFunction(
-      Intrinsic::getName(Intrinsic::experimental_widenable_condition));
+  auto *WCDecl = Intrinsic::getDeclarationIfExists(
+      F.getParent(), Intrinsic::experimental_widenable_condition);
   bool HasWidenableConditions = WCDecl && !WCDecl->use_empty();
   if (!HasIntrinsicGuards && !HasWidenableConditions)
     return PreservedAnalyses::all();
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 2668305e9c8447..ad68fc1f21e2c2 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -598,8 +598,8 @@ bool IndVarSimplify::simplifyAndExtend(Loop *L,
                                        LoopInfo *LI) {
   SmallVector<WideIVInfo, 8> WideIVs;
 
-  auto *GuardDecl = L->getBlocks()[0]->getModule()->getFunction(
-          Intrinsic::getName(Intrinsic::experimental_guard));
+  auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+      L->getBlocks()[0]->getModule(), Intrinsic::experimental_guard);
   bool HasGuards = GuardDecl && !GuardDecl->use_empty();
 
   SmallVector<PHINode *, 8> LoopPhis;
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 7a0b661a07799a..11fdc39464dfb6 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -296,8 +296,8 @@ bool JumpThreadingPass::runImpl(Function &F_, FunctionAnalysisManager *FAM_,
   DTU = std::move(DTU_);
   BFI = BFI_;
   BPI = BPI_;
-  auto *GuardDecl = F->getParent()->getFunction(
-      Intrinsic::getName(Intrinsic::experimental_guard));
+  auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+      F->getParent(), Intrinsic::experimental_guard);
   HasGuards = GuardDecl && !GuardDecl->use_empty();
 
   // Reduce the number of instructions duplicated when optimizing strictly for
diff --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
index 209b083a4e91a3..31694ad1fa508a 100644
--- a/llvm/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -1193,10 +1193,10 @@ bool LoopPredication::runOnLoop(Loop *Loop) {
 
   // There is nothing to do if the module doesn't use guards
   auto *GuardDecl =
-      M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard));
+      Intrinsic::getDeclarationIfExists(M, Intrinsic::experimental_guard);
   bool HasIntrinsicGuards = GuardDecl && !GuardDecl->use_empty();
-  auto *WCDecl = M->getFunction(
-      Intrinsic::getName(Intrinsic::experimental_widenable_condition));
+  auto *WCDecl = Intrinsic::getDeclarationIfExists(
+      M, Intrinsic::experimental_widenable_condition);
   bool HasWidenableConditions =
       PredicateWidenableBranchGuards && WCDecl && !WCDecl->use_empty();
   if (!HasIntrinsicGuards && !HasWidenableConditions)
diff --git a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
index ce35349376c483..5f3e612e73b661 100644
--- a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
@@ -27,8 +27,8 @@ using namespace llvm;
 static bool lowerGuardIntrinsic(Function &F) {
   // Check if we can cheaply rule out the possibility of not having any work to
   // do.
-  auto *GuardDecl = F.getParent()->getFunction(
-      Intrinsic::getName(Intrinsic::experimental_guard));
+  auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+      F.getParent(), Intrinsic::experimental_guard);
   if (!GuardDecl || GuardDecl->use_empty())
     return false;
 
diff --git a/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp b/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp
index 3c977b816a0506..ea2b419b17a59c 100644
--- a/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp
@@ -26,8 +26,8 @@ using namespace llvm;
 static bool lowerWidenableCondition(Function &F) {
   // Check if we can cheaply rule out the possibility of not having any work to
   // do.
-  auto *WCDecl = F.getParent()->getFunction(
-      Intrinsic::getName(Intrinsic::experimental_widenable_condition));
+  auto *WCDecl = Intrinsic::getDeclarationIfExists(
+      F.getParent(), Intrinsic::experimental_widenable_condition);
   if (!WCDecl || WCDecl->use_empty())
     return false;
 
diff --git a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp
index b9f88ba4e0780e..948466c675e974 100644
--- a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp
+++ b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp
@@ -56,8 +56,8 @@ static void turnToExplicitForm(CallInst *Guard, Function *DeoptIntrinsic) {
 static bool explicifyGuards(Function &F) {
   // Check if we can cheaply rule out the possibility of not having any work to
   // do.
-  auto *GuardDecl = F.getParent()->getFunction(
-      Intrinsic::getName(Intrinsic::experimental_guard));
+  auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+      F.getParent(), Intrinsic::experimental_guard);
   if (!GuardDecl || GuardDecl->use_empty())
     return false;
 
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index f3f5ffb6b61b47..aa3cbc5e4bddc2 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -2920,8 +2920,8 @@ static bool collectUnswitchCandidates(
   // Whether or not we should also collect guards in the loop.
   bool CollectGuards = false;
   if (UnswitchGuards) {
-    auto *GuardDecl = L.getHeader()->getParent()->getParent()->getFunction(
-        Intrinsic::getName(Intrinsic::experimental_guard));
+    auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+        L.getHeader()->getParent()->getParent(), Intrinsic::experimental_guard);
     if (GuardDecl && !GuardDecl->use_empty())
       CollectGuards = true;
   }

>From dcc5ba4a4d94e9550ff02239c252f446ab3fdf19 Mon Sep 17 00:00:00 2001
From: Stefan Pintilie <stefanp at ca.ibm.com>
Date: Wed, 16 Oct 2024 10:25:09 -0400
Subject: [PATCH 75/82] [PowerPC] Add missing patterns for lround when i32 is
 returned. (#111863)

This patch adds patterns for lround when the rounding result type is i32.
Support for an i64 rounding result existed before this patch.
---
 llvm/lib/Target/PowerPC/PPCInstrVSX.td        |  4 +
 .../CodeGen/PowerPC/scalar-rounding-ops.ll    | 84 +++++++++++++++++++
 2 files changed, 88 insertions(+)

diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index dd07892794d599..fe9ab22c576349 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -3606,6 +3606,10 @@ def : Pat<(i64 (lround f64:$S)),
           (i64 (MFVSRD (FCTID (XSRDPI $S))))>;
 def : Pat<(i64 (lround f32:$S)),
           (i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>;
+def : Pat<(i32 (lround f64:$S)),
+          (i32 (MFVSRWZ (FCTIW (XSRDPI $S))))>;
+def : Pat<(i32 (lround f32:$S)),
+          (i32 (MFVSRWZ (FCTIW (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>;
 def : Pat<(i64 (llround f64:$S)),
           (i64 (MFVSRD (FCTID (XSRDPI $S))))>;
 def : Pat<(i64 (llround f32:$S)),
diff --git a/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll b/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll
index e950c0a2efac49..2be370f638d5bd 100644
--- a/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll
@@ -214,6 +214,48 @@ entry:
 
 declare i64 @llvm.lround.i64.f64(double)
 
+define dso_local i32 @test_lroundi32f64(double %d) local_unnamed_addr {
+; BE-LABEL: test_lroundi32f64:
+; BE:       # %bb.0: # %entry
+; BE-NEXT:    mflr r0
+; BE-NEXT:    stdu r1, -112(r1)
+; BE-NEXT:    std r0, 128(r1)
+; BE-NEXT:    .cfi_def_cfa_offset 112
+; BE-NEXT:    .cfi_offset lr, 16
+; BE-NEXT:    bl lround
+; BE-NEXT:    nop
+; BE-NEXT:    addi r1, r1, 112
+; BE-NEXT:    ld r0, 16(r1)
+; BE-NEXT:    mtlr r0
+; BE-NEXT:    blr
+;
+; CHECK-LABEL: test_lroundi32f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl lround
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; FAST-LABEL: test_lroundi32f64:
+; FAST:       # %bb.0: # %entry
+; FAST-NEXT:    xsrdpi f0, f1
+; FAST-NEXT:    fctiw f0, f0
+; FAST-NEXT:    mffprwz r3, f0
+; FAST-NEXT:    blr
+entry:
+  %0 = tail call i32 @llvm.lround.i32.f64(double %d)
+  ret i32 %0
+}
+
+declare i32 @llvm.lround.i32.f64(double)
+
 define dso_local i64 @test_lroundf(float %f) local_unnamed_addr {
 ; BE-LABEL: test_lroundf:
 ; BE:       # %bb.0: # %entry
@@ -256,6 +298,48 @@ entry:
 
 declare i64 @llvm.lround.i64.f32(float)
 
+define dso_local i32 @test_lroundi32f32(float %d) local_unnamed_addr {
+; BE-LABEL: test_lroundi32f32:
+; BE:       # %bb.0: # %entry
+; BE-NEXT:    mflr r0
+; BE-NEXT:    stdu r1, -112(r1)
+; BE-NEXT:    std r0, 128(r1)
+; BE-NEXT:    .cfi_def_cfa_offset 112
+; BE-NEXT:    .cfi_offset lr, 16
+; BE-NEXT:    bl lroundf
+; BE-NEXT:    nop
+; BE-NEXT:    addi r1, r1, 112
+; BE-NEXT:    ld r0, 16(r1)
+; BE-NEXT:    mtlr r0
+; BE-NEXT:    blr
+;
+; CHECK-LABEL: test_lroundi32f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl lroundf
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; FAST-LABEL: test_lroundi32f32:
+; FAST:       # %bb.0: # %entry
+; FAST-NEXT:    xsrdpi f0, f1
+; FAST-NEXT:    fctiw f0, f0
+; FAST-NEXT:    mffprwz r3, f0
+; FAST-NEXT:    blr
+entry:
+  %0 = tail call i32 @llvm.lround.i32.f32(float %d)
+  ret i32 %0
+}
+
+declare i32 @llvm.lround.i32.f32(float)
+
 define dso_local i64 @test_llround(double %d) local_unnamed_addr {
 ; BE-LABEL: test_llround:
 ; BE:       # %bb.0: # %entry

>From 7b4c8b35d43c0a17f222722487d7a2b4ceee0a26 Mon Sep 17 00:00:00 2001
From: Brox Chen <guochen2 at amd.com>
Date: Wed, 16 Oct 2024 10:27:44 -0400
Subject: [PATCH 76/82] [AMDGPU][True16][MC] VOP3 profile in True16 format
 (#109031)

Modify the VOP3 profile and pseudo definitions, and add encoding info for
VOP3 True16, including DPP and DPP8, in true16 and fake16 formats.

This patch applies true16/fake16 changes and asm/dasm changes to
V_ADD_NC_U16
V_ADD_NC_I16
V_SUB_NC_U16
V_SUB_NC_I16
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |   8 +-
 llvm/lib/Target/AMDGPU/VOP2Instructions.td    |   4 +-
 llvm/lib/Target/AMDGPU/VOP3Instructions.td    |  84 ++--
 llvm/lib/Target/AMDGPU/VOPInstructions.td     | 312 ++++++++++--
 .../AMDGPU/GlobalISel/inst-select-add.s16.mir |  40 +-
 .../test/CodeGen/AMDGPU/dpp_combine_gfx11.mir |  32 +-
 .../isel-amdgpu-cs-chain-preserve-cc.ll       |   6 +-
 llvm/test/MC/AMDGPU/gfx11_asm_vop3.s          | 208 +++++---
 llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s    | 192 +++----
 llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s     | 124 +++--
 llvm/test/MC/AMDGPU/gfx12_asm_vop3.s          | 156 +++---
 llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s    | 328 ++++++------
 llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s     | 176 ++++---
 .../Disassembler/AMDGPU/gfx11_dasm_vop3.txt   | 444 ++++++++++++----
 .../AMDGPU/gfx11_dasm_vop3_dpp16.txt          | 376 +++++++++++---
 .../AMDGPU/gfx11_dasm_vop3_dpp8.txt           | 196 +++++++-
 .../Disassembler/AMDGPU/gfx12_dasm_vop3.txt   | 372 +++++++++++---
 .../AMDGPU/gfx12_dasm_vop3_dpp16.txt          | 472 +++++++++++++++---
 .../AMDGPU/gfx12_dasm_vop3_dpp8.txt           | 292 ++++++++++-
 19 files changed, 2896 insertions(+), 926 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 087ca1f954464d..42a1ffb8a26d4a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2149,6 +2149,8 @@ class getAsmVOP3P <int NumSrcArgs, bit HasModifiers,
   string ret = dst#", "#src0#src1#src2#opsel#mods#clamp;
 }
 
+// FIXME-TRUE16 AsmVOP3OpSel will be deprecated after all
+// VOP3 16 bit instructions are replaced to true16 format
 class getAsmVOP3OpSel <int NumSrcArgs,
                        bit HasClamp,
                        bit HasOMod,
@@ -2237,8 +2239,9 @@ class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
   string clamp = !if(HasClamp, "$clamp", "");
   string omod = !if(HasOMod, "$omod", "");
 
-  string ret = dst#!if(!gt(NumSrcArgs,0),", "#src0#src1#src2#opsel#bytesel#3PMods#clamp#omod, "");
-
+  string ret = dst#!if(!eq(NumSrcArgs,0),
+                       "",
+                       !if(HasDst,", ", "")#src0#src1#src2#opsel#bytesel#3PMods#clamp#omod);
 }
 
 class getAsmVOP3DPP<string base> {
@@ -2733,6 +2736,7 @@ def VOP_F32_F32_F16_F16 : VOPProfile <[f32, f32, f16, f16]>;
 def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>;
 def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
 def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
+def VOP_I32_I32_I32_I16 : VOPProfile <[i32, i32, i32, i16]>;
 def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
 def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>;
 def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 639f9189cbe7e3..e83ea57c61df13 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1664,8 +1664,8 @@ multiclass VOP3Only_Realtriple_gfx11_gfx12<bits<10> op> :
   VOP3Only_Realtriple<GFX11Gen, op>, VOP3Only_Realtriple<GFX12Gen, op>;
 
 multiclass VOP3Only_Realtriple_t16_gfx11_gfx12<bits<10> op, string asmName, string OpName = NAME> :
-  VOP3Only_Realtriple_t16<GFX11Gen, op, asmName, OpName>,
-  VOP3Only_Realtriple_t16<GFX12Gen, op, asmName, OpName>;
+  VOP3_Realtriple_t16_gfx11<op, asmName, OpName, "", /*IsSingle*/1>,
+  VOP3_Realtriple_t16_gfx12<op, asmName, OpName, "", /*IsSingle*/1>;
 
 multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<bits<10> op, string asmName, string OpName = NAME> {
   defm OpName#"_t16": VOP3Only_Realtriple_t16_gfx11_gfx12<op, asmName, OpName#"_t16">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 78ca7a2f258cb3..34ecdb56e8689d 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -569,16 +569,10 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
                             getAsmVOP3OpSel<3, HasClamp, HasOMod,
                                             HasSrc0FloatMods, HasSrc1FloatMods,
                                             HasSrc2FloatMods>.ret);
-  let AsmVOP3DPP16 = !subst(", $src2_modifiers", "",
-                            getAsmVOP3DPP16<getAsmVOP3Base<3, 1, HasClamp, 1,
-                                            HasOMod, 0, 1, HasSrc0FloatMods,
-                                            HasSrc1FloatMods,
-                                            HasSrc2FloatMods>.ret>.ret);
-  let AsmVOP3DPP8 = !subst(", $src2_modifiers", "",
-                           getAsmVOP3DPP8<getAsmVOP3Base<3, 1, HasClamp, 1,
-                                          HasOMod, 0, 1, HasSrc0FloatMods,
-                                          HasSrc1FloatMods,
-                                          HasSrc2FloatMods>.ret>.ret);
+  let AsmVOP3Base = !subst(", $src2_modifiers", "",
+                    getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
+                    HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, 0/*Src1Mods*/,
+                    HasModifiers, DstVT>.ret);
 }
 
 class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT> :
@@ -636,8 +630,8 @@ let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
   defm V_MAXIMUM3_F16 : VOP3Inst <"v_maximum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmaximum3>;
 } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
 
-defm V_ADD_I16 : VOP3Inst <"v_add_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
-defm V_SUB_I16 : VOP3Inst <"v_sub_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
+defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>;
+defm V_SUB_I16 : VOP3Inst_t16 <"v_sub_i16", VOP_I16_I16_I16>;
 
 defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
 defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
@@ -752,6 +746,8 @@ def : GCNPat<(DivergentBinFrag<or> (or_oneuse i64:$src0, i64:$src1), i64:$src2),
                               (i32 (EXTRACT_SUBREG $src1, sub1)),
                               (i32 (EXTRACT_SUBREG $src2, sub1))), sub1)>;
 
+} // End SubtargetPredicate = isGFX9Plus
+
 // FIXME: Probably should hardcode clamp bit in pseudo and avoid this.
 class OpSelBinOpClampPat<SDPatternOperator node,
                          Instruction inst> : GCNPat<
@@ -760,9 +756,14 @@ class OpSelBinOpClampPat<SDPatternOperator node,
   (inst $src0_modifiers, $src0, $src1_modifiers, $src1, DSTCLAMP.ENABLE, 0)
 >;
 
-def : OpSelBinOpClampPat<saddsat, V_ADD_I16_e64>;
-def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_e64>;
-} // End SubtargetPredicate = isGFX9Plus
+let SubtargetPredicate = isGFX9Plus, True16Predicate = NotHasTrue16BitInsts in {
+  def : OpSelBinOpClampPat<saddsat, V_ADD_I16_e64>;
+  def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_e64>;
+} // End SubtargetPredicate = isGFX9Plus, True16Predicate = NotHasTrue16BitInsts
+let True16Predicate = UseFakeTrue16Insts in {
+  def : OpSelBinOpClampPat<saddsat, V_ADD_I16_fake16_e64>;
+  def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_fake16_e64>;
+} // End True16Predicate = UseFakeTrue16Insts
 
 multiclass IMAD32_Pats <VOP3_Pseudo inst> {
   def : GCNPat <
@@ -871,21 +872,31 @@ let SubtargetPredicate = isGFX10Plus in {
     def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64, vt>;
   }
 
-  defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
-  defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>;
-
-  def : OpSelBinOpClampPat<uaddsat, V_ADD_NC_U16_e64>;
-  def : OpSelBinOpClampPat<usubsat, V_SUB_NC_U16_e64>;
-
-  // Undo sub x, c -> add x, -c canonicalization since c is more likely
-  // an inline immediate than -c.
-  def : GCNPat<
-    (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)),
-    (V_SUB_NC_U16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0)
-  >;
+  defm V_ADD_NC_U16 : VOP3Inst_t16 <"v_add_nc_u16", VOP_I16_I16_I16, add>;
+  defm V_SUB_NC_U16 : VOP3Inst_t16 <"v_sub_nc_u16", VOP_I16_I16_I16, sub>;
 
 } // End SubtargetPredicate = isGFX10Plus
 
+let True16Predicate = NotHasTrue16BitInsts, SubtargetPredicate = isGFX10Plus in {
+   def : OpSelBinOpClampPat<uaddsat, V_ADD_NC_U16_e64>;
+   def : OpSelBinOpClampPat<usubsat, V_SUB_NC_U16_e64>;
+   // Undo sub x, c -> add x, -c canonicalization since c is more likely
+   // an inline immediate than -c.
+   def : GCNPat<
+     (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)),
+     (V_SUB_NC_U16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0)
+   >;
+} // End True16Predicate = NotHasTrue16BitInsts, SubtargetPredicate = isGFX10Plus
+
+let True16Predicate = UseFakeTrue16Insts in {
+   def : OpSelBinOpClampPat<uaddsat, V_ADD_NC_U16_fake16_e64>;
+   def : OpSelBinOpClampPat<usubsat, V_SUB_NC_U16_fake16_e64>;
+   def : GCNPat<
+     (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)),
+     (V_SUB_NC_U16_fake16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0)
+   >;
+} // End True16Predicate = UseFakeTrue16Insts
+
 let SubtargetPredicate = isGFX12Plus in {
   let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
     defm V_PERMLANE16_VAR_B32  : VOP3Inst<"v_permlane16_var_b32",  VOP3_PERMLANE_VAR_Profile>;
@@ -1104,6 +1115,17 @@ multiclass VOP3_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
 multiclass VOP3Dot_Realtriple_gfx11_gfx12<bits<10> op> :
   VOP3Dot_Realtriple<GFX11Gen, op>, VOP3Dot_Realtriple<GFX12Gen, op>;
 
+multiclass VOP3_Realtriple_t16_gfx11_gfx12<bits<10> op, string asmName, string opName = NAME,
+                                           string pseudo_mnemonic = "", bit isSingle = 0> :
+  VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>,
+  VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
+
+multiclass VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<bits<10> op, string asmName, string opName = NAME,
+                                                      string pseudo_mnemonic = "", bit isSingle = 0> {
+  defm opName#"_t16": VOP3_Realtriple_t16_gfx11_gfx12<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>;
+  defm opName#"_fake16": VOP3_Realtriple_t16_gfx11_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
+}
+
 multiclass VOP3be_Real_gfx11_gfx12<bits<10> op, string opName, string asmName> :
   VOP3be_Real<GFX11Gen, op, opName, asmName>,
   VOP3be_Real<GFX12Gen, op, opName, asmName>;
@@ -1189,8 +1211,8 @@ defm V_DIV_SCALE_F32       : VOP3be_Real_gfx11_gfx12<0x2fc, "V_DIV_SCALE_F32", "
 defm V_DIV_SCALE_F64       : VOP3be_Real_gfx11_gfx12<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">;
 defm V_MAD_U64_U32_gfx11   : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">;
 defm V_MAD_I64_I32_gfx11   : VOP3be_Real_gfx11<0x2ff, "V_MAD_I64_I32_gfx11", "v_mad_i64_i32">;
-defm V_ADD_NC_U16          : VOP3Only_Realtriple_gfx11_gfx12<0x303>;
-defm V_SUB_NC_U16          : VOP3Only_Realtriple_gfx11_gfx12<0x304>;
+defm V_ADD_NC_U16          : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x303, "v_add_nc_u16">;
+defm V_SUB_NC_U16          : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x304, "v_sub_nc_u16">;
 defm V_MUL_LO_U16          : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x305, "v_mul_lo_u16">;
 defm V_CVT_PK_I16_F32      : VOP3_Realtriple_gfx11_gfx12<0x306>;
 defm V_CVT_PK_U16_F32      : VOP3_Realtriple_gfx11_gfx12<0x307>;
@@ -1198,8 +1220,8 @@ defm V_MAX_U16             : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30
 defm V_MAX_I16             : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30a, "v_max_i16">;
 defm V_MIN_U16             : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30b, "v_min_u16">;
 defm V_MIN_I16             : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30c, "v_min_i16">;
-defm V_ADD_NC_I16          : VOP3_Realtriple_with_name_gfx11_gfx12<0x30d, "V_ADD_I16", "v_add_nc_i16">;
-defm V_SUB_NC_I16          : VOP3_Realtriple_with_name_gfx11_gfx12<0x30e, "V_SUB_I16", "v_sub_nc_i16">;
+defm V_ADD_NC_I16          : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x30d, "v_add_nc_i16", "V_ADD_I16">;
+defm V_SUB_NC_I16          : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x30e, "v_sub_nc_i16", "V_SUB_I16">;
 defm V_PACK_B32_F16        : VOP3_Realtriple_gfx11_gfx12<0x311>;
 defm V_CVT_PK_NORM_I16_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x312, "V_CVT_PKNORM_I16_F16" , "v_cvt_pk_norm_i16_f16" >;
 defm V_CVT_PK_NORM_U16_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x313, "V_CVT_PKNORM_U16_F16" , "v_cvt_pk_norm_u16_f16" >;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 05a7d907d237ae..aab5dc7465d938 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -111,7 +111,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
 
   bit HasFP8DstByteSel = P.HasFP8DstByteSel;
 
-  let AsmOperands = !if(isVop3OpSel,
+  let AsmOperands = !if(!and(!not(P.IsTrue16), isVop3OpSel),
                         P.AsmVOP3OpSel,
                         !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));
 
@@ -178,6 +178,7 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemoni
   let SubtargetPredicate = ps.SubtargetPredicate;
   let WaveSizePredicate  = ps.WaveSizePredicate;
   let OtherPredicates    = ps.OtherPredicates;
+  let True16Predicate    = ps.True16Predicate;
   let AsmMatchConverter  = ps.AsmMatchConverter;
   let AsmVariantName     = ps.AsmVariantName;
   let Constraints        = ps.Constraints;
@@ -242,6 +243,41 @@ class VOP3a<VOPProfile P> : Enc64 {
   let Inst{63}    = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
 }
 
+// To avoid having different version of every type of operand depending on if
+// they are part of a True16 instruction or not, the operand encoding should be
+// the same for SGPR, imm, and VGPR_32 whether the instruction is True16 or not.
+class VOP3a_t16<VOPProfile P> : Enc64 {
+  bits<11> vdst;
+  bits<4> src0_modifiers;
+  bits<11> src0;
+  bits<3> src1_modifiers;
+  bits<11> src1;
+  bits<3> src2_modifiers;
+  bits<11> src2;
+  bits<1> clamp;
+  bits<2> omod;
+
+  let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
+  let Inst{8}     = !if(P.HasSrc0Mods, src0_modifiers{1}, 0);
+  let Inst{9}     = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
+  let Inst{10}    = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
+  // 16-bit select fields which can be interpreted as OpSel or hi/lo suffix
+  let Inst{11} = !if(P.HasSrc0Mods, src0_modifiers{2}, 0);
+  let Inst{12} = !if(P.HasSrc1Mods, src1_modifiers{2}, 0);
+  let Inst{13} = !if(P.HasSrc2Mods, src2_modifiers{2}, 0);
+  let Inst{14} = !if(!and(P.HasDst, P.HasSrc0Mods), src0_modifiers{3}, 0);
+  let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+
+  let Inst{31-26} = 0x35;
+  let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, 0);
+  let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0);
+  let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0);
+  let Inst{60-59} = !if(P.HasOMod, omod, 0);
+  let Inst{61}    = !if(P.HasSrc0Mods, src0_modifiers{0}, 0);
+  let Inst{62}    = !if(P.HasSrc1Mods, src1_modifiers{0}, 0);
+  let Inst{63}    = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
+}
+
 class VOP3a_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3a<p> {
   let Inst{11}    = !if(p.HasClamp, clamp{0}, 0);
   let Inst{25-17} = op;
@@ -272,6 +308,10 @@ class VOP3e_gfx10<bits<10> op, VOPProfile p> : VOP3a_gfx10<op, p> {
 
 class VOP3e_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p>;
 
+class VOP3e_t16_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3a_t16<p> {
+  let Inst{25-16} = op;
+}
+
 class VOP3e_vi <bits<10> op, VOPProfile P> : VOP3a_vi <op, P> {
   bits<8> vdst;
   let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
@@ -736,7 +776,12 @@ class VOP3_DPPe_Fields : VOP3_DPPe_Fields_Base {
   bits<8> src0;
 }
 
+class VOP3_DPPe_Fields_t16 : VOP3_DPPe_Fields_Base {
+  bits<11> src0;
+}
+
 // Common refers to common between DPP and DPP8
+// Base refers to a shared base between T16 and regular instructions
 class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
   bits<4> src0_modifiers;
   bits<3> src1_modifiers;
@@ -748,7 +793,7 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
   let Inst{8}     = !if(P.HasSrc0Mods, src0_modifiers{1}, 0);
   let Inst{9}     = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
   let Inst{10}    = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
-  // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs.
+  // 16-bit select fields which can be interpreted as OpSel or hi/lo suffix
   let Inst{11} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{2}, 0),
                                  !if(P.IsFP8SrcByteSel, byte_sel{1}, ?));
   let Inst{12} = !if(P.HasOpSel, !if(P.HasSrc1Mods, src1_modifiers{2}, 0),
@@ -777,6 +822,16 @@ class VOP3_DPPe_Common<bits<10> op, VOPProfile P> : VOP3_DPPe_Common_Base<op, P>
   let Inst{58-50} = !if(P.HasSrc2, src2, 0);
 }
 
+class VOP3_DPPe_Common_t16<bits<10> op, VOPProfile P> : VOP3_DPPe_Common_Base<op, P> {
+  bits<11> vdst;
+  bits<11> src1;
+  bits<11> src2;
+
+  let Inst{7-0}   = !if(P.EmitDst, vdst{7-0}, 0);
+  let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0);
+  let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0);
+}
+
 class VOP3P_DPPe_Common_Base<bits<7> op, VOPProfile P> : Enc96 {
   bits<4> src0_modifiers;
   bits<4> src1_modifiers;
@@ -786,6 +841,7 @@ class VOP3P_DPPe_Common_Base<bits<7> op, VOPProfile P> : Enc96 {
   let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0
   let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1
   let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2
+  // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs.
   let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0)
   let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
   let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
@@ -810,6 +866,16 @@ class VOP3P_DPPe_Common<bits<7> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P
   let Inst{58-50} = !if(P.HasSrc2, src2, 0);
 }
 
+class VOP3P_DPPe_Common_t16<bits<7> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P> {
+  bits<11> vdst;
+  bits<11> src1;
+  bits<11> src2;
+
+  let Inst{7-0} = vdst{7-0};
+  let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0);
+  let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0);
+}
+
 class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
   dag Ins = P.InsDPP, string asmOps = P.AsmDPP> :
   VOP_Pseudo<OpName, "_dpp", P, P.OutsDPP, Ins, asmOps, pattern> {
@@ -870,6 +936,7 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
   // Copy relevant pseudo op flags
   let isConvergent         = ps.isConvergent;
   let SubtargetPredicate   = ps.SubtargetPredicate;
+  let True16Predicate      = ps.True16Predicate;
   let AssemblerPredicate   = ps.AssemblerPredicate;
   let OtherPredicates      = ps.OtherPredicates;
   let AsmMatchConverter    = ps.AsmMatchConverter;
@@ -928,11 +995,29 @@ class VOP3_DPP_Base <string OpName, VOPProfile P, bit IsDPP16,
   let Size = 12;
 }
 
+class VOP3_DPP_Enc <bits<10> op, VOPProfile P, bit IsDPP16> :
+  VOP3_DPPe_Common<op, P>,
+  VOP3_DPPe_Fields {
+
+  let Inst{40-32} = 0xfa;
+  let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
+  let Inst{80-72} = dpp_ctrl;
+  let Inst{82}    = !if(IsDPP16, fi, ?);
+  let Inst{83}    = bound_ctrl;
+
+  // Inst{87-84} ignored by hw
+  let Inst{91-88} = bank_mask;
+  let Inst{95-92} = row_mask;
+}
+
 class VOP3_DPP <bits<10> op, string OpName, VOPProfile P, bit IsDPP16,
                dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP),
                string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> :
-  VOP3_DPP_Base<OpName, P, IsDPP16, InsDPP, AsmDPP>, VOP3_DPPe_Common<op, P>,
-  VOP3_DPPe_Fields {
+  VOP3_DPP_Base<OpName, P, IsDPP16, InsDPP, AsmDPP>, VOP3_DPP_Enc<op, P, IsDPP16>;
+
+class VOP3_DPP_Enc_t16<bits<10> op, VOPProfile P, bit IsDPP16 >
+    : VOP3_DPPe_Common_t16<op, P>,
+      VOP3_DPPe_Fields_t16 {
 
   let Inst{40-32} = 0xfa;
   let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
@@ -945,6 +1030,13 @@ class VOP3_DPP <bits<10> op, string OpName, VOPProfile P, bit IsDPP16,
   let Inst{95-92} = row_mask;
 }
 
+class VOP3_DPP_t16<bits<10> op, string OpName, VOPProfile P, bit IsDPP16,
+                   dag InsDPP = !if (IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP),
+                   string AsmDPP = !if (IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)>
+    : VOP3_DPP_Base<OpName, P, IsDPP16, InsDPP, AsmDPP>,
+      VOP3_DPP_Enc_t16<op, P, IsDPP16> {
+}
+
 class VOP3P_DPP <bits<7> op, string OpName, VOPProfile P, bit IsDPP16,
                dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP),
                string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> :
@@ -979,6 +1071,12 @@ class VOP3_DPP8e_Fields {
   bits<9> fi;
 }
 
+class VOP3_DPP8e_Fields_t16 {
+  bits<11> src0;
+  bits<24> dpp8;
+  bits<9> fi;
+}
+
 class VOP_DPP8_Base<string OpName, VOPProfile P, dag InsDPP8 = P.InsDPP8, string AsmDPP8 = P.AsmDPP8> :
   InstSI<P.OutsDPP8, InsDPP8, OpName#AsmDPP8, []> {
 
@@ -1011,16 +1109,28 @@ class VOP3_DPP8_Base<string OpName, VOPProfile P> :
   let Size = 12;
 }
 
+class VOP3_DPP8_Enc <bits<10> op, VOPProfile P> :
+  VOP3_DPPe_Common<op, P>,
+  VOP3_DPP8e_Fields {
+  let Inst{40-32} = fi;
+  let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
+  let Inst{95-72} = dpp8{23-0};
+}
 
 class VOP3_DPP8<bits<10> op, string OpName, VOPProfile P> :
-  VOP3_DPP8_Base<OpName, P>, VOP3_DPPe_Common<op, P>,
-  VOP3_DPP8e_Fields {
+  VOP3_DPP8_Base<OpName, P>, VOP3_DPP8_Enc<op, P>;
 
+class VOP3_DPP8_Enc_t16 <bits<10> op, VOPProfile P> :
+  VOP3_DPPe_Common_t16<op, P>,
+  VOP3_DPP8e_Fields_t16 {
   let Inst{40-32} = fi;
   let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
   let Inst{95-72} = dpp8{23-0};
 }
 
+class VOP3_DPP8_t16<bits<10> op, string OpName, VOPProfile P> :
+  VOP3_DPP8_Base<OpName, P>, VOP3_DPP8_Enc_t16<op, P>;
+
 class VOP3P_DPP8<bits<7> op, string OpName, VOPProfile P> :
   VOP3_DPP8_Base<OpName, P>, VOP3P_DPPe_Common<op, P>,
   VOP3_DPP8e_Fields {
@@ -1273,6 +1383,30 @@ class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Pr
 
 }
 
+class VOP3_Profile_True16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile_True16<P> {
+  let HasClamp = !if(Features.HasClamp, 1, P.HasClamp);
+  let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel);
+  let IsMAI    = !if(Features.IsMAI,    1, P.IsMAI);
+  let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
+
+  let HasModifiers =
+      !if (Features.IsMAI, 0,
+           !or(Features.IsPacked, Features.HasOpSel, P.HasModifiers));
+  let IsSingle = 1;
+}
+
+class VOP3_Profile_Fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile_Fake16<P> {
+  let HasClamp = !if(Features.HasClamp, 1, P.HasClamp);
+  let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel);
+  let IsMAI    = !if(Features.IsMAI,    1, P.IsMAI);
+  let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
+
+  let HasModifiers =
+      !if (Features.IsMAI, 0,
+           !or(Features.IsPacked, Features.HasOpSel, P.HasModifiers));
+  let IsSingle = 1;
+}
+
 // consistently gives instructions a _e64 suffix
 multiclass VOP3Inst_Pseudo_Wrapper<string opName, VOPProfile P, list<dag> pattern = [], bit VOP3Only = 0> {
     def _e64 : VOP3_Pseudo<opName, P, pattern, VOP3Only>;
@@ -1325,11 +1459,33 @@ multiclass VOP3PseudoScalarInst<string OpName, VOPProfile P,
                                i32:$omod))))]>;
 }
 
+multiclass VOP3Inst_t16_with_profiles<string OpName, VOPProfile P, VOPProfile P_t16,
+                        VOPProfile P_fake16,
+                        SDPatternOperator node = null_frag,
+                        SDPatternOperator node_t16 = node> {
+  let True16Predicate = NotHasTrue16BitInsts  in {
+    defm NAME : VOP3Inst<OpName, P, node>;
+  }
+  let True16Predicate = UseRealTrue16Insts in {
+    defm _t16 : VOP3Inst<OpName # "_t16", P_t16, node_t16>;
+  }
+  let True16Predicate = UseFakeTrue16Insts in {
+    defm _fake16 : VOP3Inst<OpName # "_fake16", P_fake16, node>;
+  }
+}
+
+multiclass VOP3Inst_t16<string OpName, VOPProfile P,
+                        SDPatternOperator node = null_frag,
+                        SDPatternOperator node_t16 = node>
+ : VOP3Inst_t16_with_profiles<OpName, VOP3_Profile<P, VOP3_OPSEL>,
+                VOP3_Profile_True16<P, VOP3_OPSEL>, VOP3_Profile_Fake16<P, VOP3_OPSEL>,
+                node, node_t16>;
+
 //===----------------------------------------------------------------------===//
 // VOP3 DPP
 //===----------------------------------------------------------------------===//
 
-class Base_VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
+class VOP3_DPP16_Helper<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
     : VOP3_DPP<op, opName, ps.Pfl, 1> {
   let VOP3_OPSEL = ps.Pfl.HasOpSel;
   let IsDOT = ps.IsDOT;
@@ -1342,17 +1498,43 @@ class Base_VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
   let OtherPredicates = ps.OtherPredicates;
 }
 
+class VOP3_DPP16_t16_Helper<bits<10> op, VOP_DPP_Pseudo ps,
+                          string opName = ps.OpName>
+    : VOP3_DPP_t16<op, opName, ps.Pfl, 1> {
+  let VOP3_OPSEL = ps.Pfl.HasOpSel;
+  let IsDOT = ps.IsDOT;
+  let hasSideEffects = ps.hasSideEffects;
+  let Defs = ps.Defs;
+  let SchedRW = ps.SchedRW;
+  let Uses = ps.Uses;
+  let AssemblerPredicate = HasDPP16;
+  let SubtargetPredicate = HasDPP16;
+  let OtherPredicates = ps.OtherPredicates;
+}
+
 class VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, int subtarget,
                  string opName = ps.OpName>
-    : Base_VOP3_DPP16<op, ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>;
+    : VOP3_DPP16_Helper<op, ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>;
+
+class VOP3_DPP16_t16<bits<10> op, VOP_DPP_Pseudo ps, int subtarget,
+                     string opName = ps.OpName>
+    : VOP3_DPP16_t16_Helper<op, ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>;
 
 class VOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo ps, GFXGen Gen,
-                     string opName = ps.OpName> :
-  VOP3_DPP16 <op, ps, Gen.Subtarget, opName> {
+                     string opName = ps.OpName>
+    : VOP3_DPP16<op, ps, Gen.Subtarget, opName> {
   let AssemblerPredicate = Gen.AssemblerPredicate;
-  let True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate);
-  let DecoderNamespace = Gen.DecoderNamespace#
-                         !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
+  let DecoderNamespace = Gen.DecoderNamespace;
+}
+
+class VOP3_DPP16_Gen_t16<bits<10> op, VOP_DPP_Pseudo ps, GFXGen Gen,
+                         string opName = ps.OpName>
+    : VOP3_DPP16_t16<op, ps, Gen.Subtarget, opName> {
+  let True16Predicate =
+      !if (ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate);
+  let AssemblerPredicate = Gen.AssemblerPredicate;
+  let DecoderNamespace =
+      Gen.DecoderNamespace #!if (ps.Pfl.IsRealTrue16, "", "_FAKE16");
 }
 
 class Base_VOP3_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
@@ -1366,11 +1548,25 @@ class Base_VOP3_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
 
   let SubtargetPredicate = ps.SubtargetPredicate;
   let OtherPredicates = ps.OtherPredicates;
+  let True16Predicate = ps.True16Predicate;
+}
+
+class Base_VOP3_DPP8_t16<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
+    : VOP3_DPP8_t16<op, opName, ps.Pfl> {
+  let VOP3_OPSEL = ps.Pfl.HasOpSel;
+  let IsDOT = ps.IsDOT;
+  let hasSideEffects = ps.hasSideEffects;
+  let Defs = ps.Defs;
+  let SchedRW = ps.SchedRW;
+  let Uses = ps.Uses;
+
+  let OtherPredicates = ps.OtherPredicates;
+  let True16Predicate = ps.True16Predicate;
 }
 
 class Base_VOP3b_DPP16<bits<10> op, VOP_DPP_Pseudo ps,
                        string opName = ps.OpName>
-    : Base_VOP3_DPP16<op, ps, opName> {
+    : VOP3_DPP16_Helper<op, ps, opName> {
   bits<7> sdst;
   let Inst{14 - 8} = sdst;
 }
@@ -1381,6 +1577,12 @@ class VOP3b_DPP8_Base<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
   let Inst{14 - 8} = sdst;
 }
 
+class VOP3b_DPP8_Base_t16<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
+    : Base_VOP3_DPP8<op, ps, opName> {
+  bits<8> sdst;
+  let Inst{14 - 8} = sdst{7-1};
+}
+
 //===----------------------------------------------------------------------===//
 // VOP3 GFX11, GFX12
 //===----------------------------------------------------------------------===//
@@ -1420,10 +1622,11 @@ multiclass VOP3Dot_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME,
 }
 
 multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
-                               string asmName, bit isSingle = 0> {
+                               string asmName, string pseudo_mnemonic = "", bit isSingle = 0> {
   defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
   let AsmString = asmName # ps.AsmOperands,
       IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
+    // FIXME-TRUE16: Support FP8 instructions properly.
     if ps.Pfl.IsFP8SrcByteSel then {
       def _e64#Gen.Suffix :
         VOP3_Real_Gen<ps, Gen>,
@@ -1432,17 +1635,27 @@ multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
       def _e64#Gen.Suffix :
         VOP3_Real_Gen<ps, Gen>,
         VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<op, ps.Pfl>;
-    } else if ps.Pfl.HasOpSel then {
-      def _e64#Gen.Suffix :
-        VOP3_Real_Gen<ps, Gen>,
-        VOP3OpSel_gfx11_gfx12<op, ps.Pfl>;
     } else {
-      def _e64#Gen.Suffix :
-        VOP3_Real_Gen<ps, Gen>,
-        VOP3e_gfx11_gfx12<op, ps.Pfl>;
+      if ps.Pfl.IsRealTrue16 then {
+        def _e64#Gen.Suffix :
+          VOP3_Real_Gen<ps, Gen>,
+          VOP3e_t16_gfx11_gfx12<op, ps.Pfl>;
+      } else {
+        if ps.Pfl.HasOpSel then {
+          def _e64#Gen.Suffix :
+            VOP3_Real_Gen<ps, Gen>,
+            VOP3OpSel_gfx11_gfx12<op, ps.Pfl>;
+        } else {
+          def _e64#Gen.Suffix :
+            VOP3_Real_Gen<ps, Gen>,
+            VOP3e_gfx11_gfx12<op, ps.Pfl>;
+        }
+      }
     }
   }
-  def Gen.Suffix#"_VOP3_alias" : LetDummies, AMDGPUMnemonicAlias<ps.Mnemonic, asmName> {
+  def Gen.Suffix#"_VOP3_alias" : LetDummies,
+                                 AMDGPUMnemonicAlias<!if(!empty(pseudo_mnemonic),
+                                                     ps.Mnemonic, pseudo_mnemonic), asmName, ""> {
     let AssemblerPredicate = Gen.AssemblerPredicate;
   }
 }
@@ -1456,8 +1669,13 @@ multiclass VOP3_Real_No_Suffix<GFXGen Gen, bits<10> op, string opName = NAME> {
 }
 
 multiclass VOP3_Real_dpp_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
-  def _e64_dpp#Gen.Suffix :
-    VOP3_DPP16_Gen<op, !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp"), Gen>;
+  defvar ps = !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp");
+  if ps.Pfl.IsTrue16 then
+    def _e64_dpp#Gen.Suffix :
+      VOP3_DPP16_Gen_t16<op, ps, Gen>;
+  else
+    def _e64_dpp#Gen.Suffix :
+      VOP3_DPP16_Gen<op, ps, Gen>;
 }
 
 multiclass VOP3Dot_Real_dpp_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
@@ -1552,18 +1770,14 @@ multiclass VOP3Only_Realtriple<GFXGen Gen, bits<10> op> :
   VOP3_Realtriple<Gen, op, 1>;
 
 multiclass VOP3_Realtriple_with_name<GFXGen Gen, bits<10> op, string opName,
-                                     string asmName, bit isSingle = 0> :
-  VOP3_Real_with_name<Gen, op, opName, asmName, isSingle>,
+                                     string asmName, string pseudo_mnemonic = "", bit isSingle = 0> :
+  VOP3_Real_with_name<Gen, op, opName, asmName, pseudo_mnemonic, isSingle>,
   VOP3_Real_dpp_with_name<Gen, op, opName, asmName>,
   VOP3_Real_dpp8_with_name<Gen, op, opName, asmName>;
 
 multiclass VOP3Only_Realtriple_with_name<GFXGen Gen, bits<10> op, string opName,
                                          string asmName> :
-  VOP3_Realtriple_with_name<Gen, op, opName, asmName, 1>;
-
-multiclass VOP3Only_Realtriple_t16<GFXGen Gen, bits<10> op, string asmName,
-                                   string opName = NAME>
-    : VOP3Only_Realtriple_with_name<Gen, op, opName, asmName>;
+  VOP3_Realtriple_with_name<Gen, op, opName, asmName, "", 1>;
 
 multiclass VOP3be_Realtriple<
     GFXGen Gen, bits<10> op, bit isSingle = 0, string opName = NAME,
@@ -1579,6 +1793,16 @@ multiclass VOP3beOnly_Realtriple<GFXGen Gen, bits<10> op> :
 // VOP3 GFX11
 //===----------------------------------------------------------------------===//
 
+// VOP1 and VOP2 depend on these triple defs
+
+multiclass VOP3_Realtriple_t16_gfx11<bits<10> op, string asmName, string opName = NAME,
+                                     string pseudo_mnemonic = "", bit isSingle = 0> :
+  VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
+
+multiclass VOP3Only_Realtriple_t16_gfx11<bits<10> op, string asmName,
+                                         string opName = NAME, string pseudo_mnemonic = "">
+  : VOP3_Realtriple_t16_gfx11<op, asmName, opName, pseudo_mnemonic, 1>;
+
 multiclass VOP3be_Real_gfx11<bits<10> op, string opName, string asmName,
                              bit isSingle = 0> :
   VOP3be_Real<GFX11Gen, op, opName, asmName, isSingle>;
@@ -1591,10 +1815,6 @@ multiclass VOP3_Realtriple_gfx11<bits<10> op, bit isSingle = 0,
                                  string opName = NAME> :
   VOP3_Realtriple<GFX11Gen, op, isSingle, opName>;
 
-multiclass VOP3Only_Realtriple_t16_gfx11<bits<10> op, string asmName,
-                                     string opName = NAME>
-    : VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, asmName>;
-
 //===----------------------------------------------------------------------===//
 // VOP3 GFX12
 //===----------------------------------------------------------------------===//
@@ -1610,6 +1830,16 @@ multiclass VOP3Only_Real_Base_gfx12<bits<10> op> :
 multiclass VOP3Only_Realtriple_t16_gfx12<bits<10> op> :
   VOP3Only_Realtriple<GFX12Gen, op>;
 
+multiclass VOP3_Realtriple_t16_gfx12<bits<10> op, string asmName, string opName = NAME,
+                                     string pseudo_mnemonic = "", bit isSingle = 0> :
+  VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
+
+multiclass VOP3_Realtriple_t16_and_fake16_gfx12<bits<10> op, string asmName, string opName = NAME,
+                                                string pseudo_mnemonic = "", bit isSingle = 0> {
+  defm opName#"_t16" : VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>;
+  defm opName#"_fake16" : VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
+}
+
 multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName,
                                        string asmName, bit isSingle = 0> {
   defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
@@ -1624,18 +1854,14 @@ multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName,
 }
 
 multiclass VOP3_Realtriple_with_name_gfx12<bits<10> op, string opName,
-                                           string asmName, bit isSingle = 0> :
-  VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, isSingle>;
+                                           string asmName, string pseudo_mnemonic = "", bit isSingle = 0> :
+  VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
 
 multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
                                                      string asmName> :
   VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, asmName>,
   VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>;
 
-multiclass VOP3Only_Realtriple_with_name_t16_gfx12<bits<10> op, string asmName,
-                                     string opName = NAME>
-    : VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>;
-
 //===----------------------------------------------------------------------===//
 
 include "VOPCInstructions.td"
@@ -1705,4 +1931,4 @@ def VOPTrue16Table : GenericTable {
 
   let PrimaryKey = ["Opcode"];
   let PrimaryKeyName = "getTrue16OpcodeHelper";
-}
\ No newline at end of file
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir
index 11411c691c3901..1971cd80d5686b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir
@@ -2,7 +2,7 @@
 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s
 
 # Note: 16-bit instructions generally produce a 0 result in the high 16-bits on GFX8 and GFX9 and preserve high 16 bits on GFX10+
 
@@ -23,6 +23,7 @@ body: |
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6-NEXT: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec
     ; GFX6-NEXT: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]]
+    ;
     ; GFX10-LABEL: name: add_s16
     ; GFX10: liveins: $vgpr0, $vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -30,6 +31,14 @@ body: |
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX10-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
     ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ADD_NC_U16_e64_]]
+    ;
+    ; GFX11-LABEL: name: add_s16
+    ; GFX11: liveins: $vgpr0, $vgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX11-NEXT: [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_ADD_NC_U16_fake16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
     %2:vgpr(s16) = G_TRUNC %0
@@ -56,6 +65,7 @@ body: |
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6-NEXT: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec
     ; GFX6-NEXT: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]]
+    ;
     ; GFX10-LABEL: name: add_s16_zext_to_s32
     ; GFX10: liveins: $vgpr0, $vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -65,6 +75,16 @@ body: |
     ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
     ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ADD_NC_U16_e64_]], implicit $exec
     ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+    ;
+    ; GFX11-LABEL: name: add_s16_zext_to_s32
+    ; GFX11: liveins: $vgpr0, $vgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX11-NEXT: [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+    ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ADD_NC_U16_fake16_e64_]], implicit $exec
+    ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
     %2:vgpr(s16) = G_TRUNC %0
@@ -91,12 +111,20 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec
     ; GFX6-NEXT: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]]
+    ;
     ; GFX10-LABEL: name: add_s16_neg_inline_const_64
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX10-NEXT: [[V_SUB_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec
     ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_SUB_NC_U16_e64_]]
+    ;
+    ; GFX11-LABEL: name: add_s16_neg_inline_const_64
+    ; GFX11: liveins: $vgpr0
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX11-NEXT: [[V_SUB_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_fake16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec
+    ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_SUB_NC_U16_fake16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
     %2:vgpr(s16) = G_CONSTANT i16 -64
@@ -121,6 +149,7 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec
     ; GFX6-NEXT: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]]
+    ;
     ; GFX10-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
@@ -129,6 +158,15 @@ body: |
     ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
     ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_SUB_NC_U16_e64_]], implicit $exec
     ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+    ;
+    ; GFX11-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32
+    ; GFX11: liveins: $vgpr0
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX11-NEXT: [[V_SUB_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_fake16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec
+    ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+    ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_SUB_NC_U16_fake16_e64_]], implicit $exec
+    ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
     %2:vgpr(s16) = G_CONSTANT i16 -64
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
index 1151bde02ef62c..41b61f2e09a3d3 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
@@ -82,9 +82,9 @@ body:             |
 
 # Regression test for src_modifiers on base u16 opcode
 # GCN-LABEL: name: vop3_u16
-# GCN: %5:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec
-# GCN: %7:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 1, %5, 2, %5, 0, 0, 1, 15, 15, 1, implicit $exec
-# GCN: %9:vgpr_32 = V_ADD_NC_U16_e64 4, %8, 8, %7, 0, 0, implicit $exec
+# GCN: %5:vgpr_32 = V_ADD_NC_U16_fake16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec
+# GCN: %7:vgpr_32 = V_ADD_NC_U16_fake16_e64_dpp %3, 1, %5, 2, %5, 0, 0, 1, 15, 15, 1, implicit $exec
+# GCN: %9:vgpr_32 = V_ADD_NC_U16_fake16_e64 4, %8, 8, %7, 0, 0, implicit $exec
 name: vop3_u16
 tracksRegLiveness: true
 body:             |
@@ -96,11 +96,11 @@ body:             |
     %2:vgpr_32 = COPY $vgpr2
     %3:vgpr_32 = IMPLICIT_DEF
     %4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
-    %5:vgpr_32 = V_ADD_NC_U16_e64 0, %4, 0, %3, 0, 0, implicit $exec
+    %5:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, %4, 0, %3, 0, 0, implicit $exec
     %6:vgpr_32 = V_MOV_B32_dpp %3, %5, 1, 15, 15, 1, implicit $exec
-    %7:vgpr_32 = V_ADD_NC_U16_e64 1, %6, 2, %5, 0, 0, implicit $exec
+    %7:vgpr_32 = V_ADD_NC_U16_fake16_e64 1, %6, 2, %5, 0, 0, implicit $exec
     %8:vgpr_32 = V_MOV_B32_dpp %3, %7, 1, 15, 15, 1, implicit $exec
-    %9:vgpr_32 = V_ADD_NC_U16_e64 4, %8, 8, %7, 0, 0, implicit $exec
+    %9:vgpr_32 = V_ADD_NC_U16_fake16_e64 4, %8, 8, %7, 0, 0, implicit $exec
 ...
 
 name: vop3p
@@ -880,11 +880,11 @@ body: |
 
 # Check op_sel is all 0s when combining
 # GCN-LABEL: name: opsel_vop3
-# GCN: %4:vgpr_32 = V_ADD_I16_e64_dpp %2, 0, %0, 0, %1, 0, 0, 1, 15, 15, 1, implicit $exec
-# GCN: %6:vgpr_32 = V_ADD_I16_e64 4, %5, 0, %1, 0, 0, implicit $exec
-# GCN: %8:vgpr_32 = V_ADD_I16_e64 0, %7, 4, %1, 0, 0, implicit $exec
-# GCN: %10:vgpr_32 = V_ADD_I16_e64 4, %9, 4, %1, 0, 0, implicit $exec
-# GCN: %12:vgpr_32 = V_ADD_I16_e64 8, %11, 0, %1, 0, 0, implicit $exec
+# GCN: %4:vgpr_32 = V_ADD_I16_fake16_e64_dpp %2, 0, %0, 0, %1, 0, 0, 1, 15, 15, 1, implicit $exec
+# GCN: %6:vgpr_32 = V_ADD_I16_fake16_e64 4, %5, 0, %1, 0, 0, implicit $exec
+# GCN: %8:vgpr_32 = V_ADD_I16_fake16_e64 0, %7, 4, %1, 0, 0, implicit $exec
+# GCN: %10:vgpr_32 = V_ADD_I16_fake16_e64 4, %9, 4, %1, 0, 0, implicit $exec
+# GCN: %12:vgpr_32 = V_ADD_I16_fake16_e64 8, %11, 0, %1, 0, 0, implicit $exec
 name:            opsel_vop3
 tracksRegLiveness: true
 body:             |
@@ -897,23 +897,23 @@ body:             |
 
     ; Combine for op_sel:[0,0,0]
     %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
-    %4:vgpr_32 = V_ADD_I16_e64 0, %3, 0, %1, 0, 0, implicit $exec
+    %4:vgpr_32 = V_ADD_I16_fake16_e64 0, %3, 0, %1, 0, 0, implicit $exec
 
     ; Do not combine for op_sel:[1,0,0]
     %5:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
-    %6:vgpr_32 = V_ADD_I16_e64 4, %5, 0, %1, 0, 0, implicit $exec
+    %6:vgpr_32 = V_ADD_I16_fake16_e64 4, %5, 0, %1, 0, 0, implicit $exec
 
     ; Do not combine for op_sel:[0,1,0]
     %7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
-    %8:vgpr_32 = V_ADD_I16_e64 0, %7, 4, %1, 0, 0, implicit $exec
+    %8:vgpr_32 = V_ADD_I16_fake16_e64 0, %7, 4, %1, 0, 0, implicit $exec
 
     ; Do not combine for op_sel:[1,1,0]
     %9:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
-    %10:vgpr_32 = V_ADD_I16_e64 4, %9, 4, %1, 0, 0, implicit $exec
+    %10:vgpr_32 = V_ADD_I16_fake16_e64 4, %9, 4, %1, 0, 0, implicit $exec
 
     ; Do not combine for op_sel:[0,0,1] (dst_op_sel only)
     %11:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
-    %12:vgpr_32 = V_ADD_I16_e64 8, %11, 0, %1, 0, 0, implicit $exec
+    %12:vgpr_32 = V_ADD_I16_fake16_e64 8, %11, 0, %1, 0, 0, implicit $exec
 ...
 
 # Check op_sel is all 0s and op_sel_hi is all 1s when combining
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index c62b4e565078e4..2e2a1094ba99ae 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -996,7 +996,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
   ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
   ; GISEL-GFX11-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
   ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; GISEL-GFX11-NEXT:   [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $exec
+  ; GISEL-GFX11-NEXT:   [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $exec
   ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
   ; GISEL-GFX11-NEXT:   FLAT_STORE_SHORT [[COPY3]], [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
   ; GISEL-GFX11-NEXT:   S_ENDPGM 0
@@ -1020,7 +1020,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
   ; DAGISEL-GFX11-WF32-NEXT: {{  $}}
   ; DAGISEL-GFX11-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
   ; DAGISEL-GFX11-WF32-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-WF32-NEXT:   [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
   ; DAGISEL-GFX11-WF32-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
   ; DAGISEL-GFX11-WF32-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
   ; DAGISEL-GFX11-WF32-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
@@ -1032,7 +1032,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
   ; DAGISEL-GFX11-WF64-NEXT: {{  $}}
   ; DAGISEL-GFX11-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
   ; DAGISEL-GFX11-WF64-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-WF64-NEXT:   [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
   ; DAGISEL-GFX11-WF64-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
   ; DAGISEL-GFX11-WF64-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
   ; DAGISEL-GFX11-WF64-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
index 67038f4c8eec09..210d55898367d8 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
@@ -244,49 +244,67 @@ v_add_lshl_u32 v5, src_scc, vcc_lo, -1
 v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null
 // GFX11: encoding: [0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf]
 
-v_add_nc_i16 v5, v1, v2
-// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
+v_add_nc_i16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
 
-v_add_nc_i16 v5, v255, v255
-// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
+v_add_nc_i16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
 
-v_add_nc_i16 v5, s1, s2
+v_add_nc_i16 v5.l, s1, s2
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
 
-v_add_nc_i16 v5, s105, s105
+v_add_nc_i16 v5.l, s105, s105
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
 
-v_add_nc_i16 v5, vcc_lo, ttmp15
+v_add_nc_i16 v5.l, vcc_lo, ttmp15
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
 
-v_add_nc_i16 v5, vcc_hi, 0xfe0b
+v_add_nc_i16 v5.l, vcc_hi, 0xfe0b
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 
-v_add_nc_i16 v5, ttmp15, src_scc
+v_add_nc_i16 v5.l, ttmp15, src_scc
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
 
-v_add_nc_i16 v5, m0, 0.5
+v_add_nc_i16 v5.l, m0, 0.5
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x00]
 
-v_add_nc_i16 v5, exec_lo, -1
+v_add_nc_i16 v5.l, exec_lo, -1
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
 
-v_add_nc_i16 v5, exec_hi, null
+v_add_nc_i16 v5.l, exec_hi, null
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
 
-v_add_nc_i16 v5, null, exec_lo op_sel:[1,1,1]
+v_add_nc_i16 v5.l, null, exec_lo
+// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+
+v_add_nc_i16 v5.l, -1, exec_hi
+// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+
+v_add_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1]
 // GFX11: encoding: [0x05,0x58,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
 
-v_add_nc_i16 v5, -1, exec_hi op_sel:[0,0,0]
+v_add_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0]
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
 
-v_add_nc_i16 v5, 0.5, m0 op_sel:[1,0,0]
+v_add_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0]
 // GFX11: encoding: [0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x00]
 
-v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
 // GFX11: encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
 
-v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX11: encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_add_nc_i16 v5.l, src_scc, vcc_lo
+// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+
+v_add_nc_i16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_i16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+
+v_add_nc_i16 v255.h, 0xfe0b, vcc_hi clamp
 // GFX11: encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 v_add_nc_i32 v5, v1, v2
@@ -334,49 +352,67 @@ v_add_nc_i32 v5, src_scc, vcc_lo
 v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp
 // GFX11: encoding: [0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_add_nc_u16 v5, v1, v2
-// GFX11: encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
+v_add_nc_u16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
 
-v_add_nc_u16 v5, v255, v255
-// GFX11: encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
+v_add_nc_u16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
 
-v_add_nc_u16 v5, s1, s2
+v_add_nc_u16 v5.l, s1, s2
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
 
-v_add_nc_u16 v5, s105, s105
+v_add_nc_u16 v5.l, s105, s105
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
 
-v_add_nc_u16 v5, vcc_lo, ttmp15
+v_add_nc_u16 v5.l, vcc_lo, ttmp15
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
 
-v_add_nc_u16 v5, vcc_hi, 0xfe0b
+v_add_nc_u16 v5.l, vcc_hi, 0xfe0b
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 
-v_add_nc_u16 v5, ttmp15, src_scc
+v_add_nc_u16 v5.l, ttmp15, src_scc
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
 
-v_add_nc_u16 v5, m0, 0.5
+v_add_nc_u16 v5.l, m0, 0.5
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x00]
 
-v_add_nc_u16 v5, exec_lo, -1
+v_add_nc_u16 v5.l, exec_lo, -1
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
 
-v_add_nc_u16 v5, exec_hi, null
+v_add_nc_u16 v5.l, exec_hi, null
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
 
-v_add_nc_u16 v5, null, exec_lo op_sel:[1,1,1]
+v_add_nc_u16 v5.l, null, exec_lo
+// GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+
+v_add_nc_u16 v5.l, -1, exec_hi
+// GFX11: encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+
+v_add_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1]
 // GFX11: encoding: [0x05,0x58,0x03,0xd7,0x7c,0xfc,0x00,0x00]
 
-v_add_nc_u16 v5, -1, exec_hi op_sel:[0,0,0]
+v_add_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0]
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00]
 
-v_add_nc_u16 v5, 0.5, m0 op_sel:[1,0,0]
+v_add_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0]
 // GFX11: encoding: [0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x00]
 
-v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
 // GFX11: encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
 
-v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX11: encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_add_nc_u16 v5.l, src_scc, vcc_lo
+// GFX11: encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+
+v_add_nc_u16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_u16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+
+v_add_nc_u16 v255.h, 0xfe0b, vcc_hi clamp
 // GFX11: encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 v_alignbit_b32 v5, v1, v2, s3
@@ -5801,49 +5837,67 @@ v_sub_co_u32 v5, ttmp[14:15], src_scc, vcc_lo
 v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp
 // GFX11: encoding: [0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_sub_nc_i16 v5, v1, v2
-// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00]
+v_sub_nc_i16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
 
-v_sub_nc_i16 v5, v255, v255
-// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00]
+v_sub_nc_i16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
 
-v_sub_nc_i16 v5, s1, s2
+v_sub_nc_i16 v5.l, s1, s2
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
 
-v_sub_nc_i16 v5, s105, s105
+v_sub_nc_i16 v5.l, s105, s105
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
 
-v_sub_nc_i16 v5, vcc_lo, ttmp15
+v_sub_nc_i16 v5.l, vcc_lo, ttmp15
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
 
-v_sub_nc_i16 v5, vcc_hi, 0xfe0b
+v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 
-v_sub_nc_i16 v5, ttmp15, src_scc
+v_sub_nc_i16 v5.l, ttmp15, src_scc
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
 
-v_sub_nc_i16 v5, m0, 0.5
+v_sub_nc_i16 v5.l, m0, 0.5
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x00]
 
-v_sub_nc_i16 v5, exec_lo, -1
+v_sub_nc_i16 v5.l, exec_lo, -1
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
 
-v_sub_nc_i16 v5, exec_hi, null
+v_sub_nc_i16 v5.l, exec_hi, null
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
 
-v_sub_nc_i16 v5, null, exec_lo op_sel:[1,1,1]
+v_sub_nc_i16 v5.l, null, exec_lo
+// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+
+v_sub_nc_i16 v5.l, -1, exec_hi
+// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+
+v_sub_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1]
 // GFX11: encoding: [0x05,0x58,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
 
-v_sub_nc_i16 v5, -1, exec_hi op_sel:[0,0,0]
+v_sub_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0]
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
 
-v_sub_nc_i16 v5, 0.5, m0 op_sel:[1,0,0]
+v_sub_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0]
 // GFX11: encoding: [0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x00]
 
-v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
 // GFX11: encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
 
-v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX11: encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_sub_nc_i16 v5.l, src_scc, vcc_lo
+// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+
+v_sub_nc_i16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_i16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+
+v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi clamp
 // GFX11: encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 v_sub_nc_i32 v5, v1, v2
@@ -5891,49 +5945,67 @@ v_sub_nc_i32 v5, src_scc, vcc_lo
 v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp
 // GFX11: encoding: [0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_sub_nc_u16 v5, v1, v2
-// GFX11: encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
+v_sub_nc_u16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
 
-v_sub_nc_u16 v5, v255, v255
-// GFX11: encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00]
+v_sub_nc_u16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
 
-v_sub_nc_u16 v5, s1, s2
+v_sub_nc_u16 v5.l, s1, s2
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
 
-v_sub_nc_u16 v5, s105, s105
+v_sub_nc_u16 v5.l, s105, s105
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
 
-v_sub_nc_u16 v5, vcc_lo, ttmp15
+v_sub_nc_u16 v5.l, vcc_lo, ttmp15
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
 
-v_sub_nc_u16 v5, vcc_hi, 0xfe0b
+v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 
-v_sub_nc_u16 v5, ttmp15, src_scc
+v_sub_nc_u16 v5.l, ttmp15, src_scc
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
 
-v_sub_nc_u16 v5, m0, 0.5
+v_sub_nc_u16 v5.l, m0, 0.5
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x00]
 
-v_sub_nc_u16 v5, exec_lo, -1
+v_sub_nc_u16 v5.l, exec_lo, -1
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
 
-v_sub_nc_u16 v5, exec_hi, null
+v_sub_nc_u16 v5.l, exec_hi, null
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
 
-v_sub_nc_u16 v5, null, exec_lo op_sel:[1,1,1]
+v_sub_nc_u16 v5.l, null, exec_lo
+// GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+
+v_sub_nc_u16 v5.l, -1, exec_hi
+// GFX11: encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+
+v_sub_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1]
 // GFX11: encoding: [0x05,0x58,0x04,0xd7,0x7c,0xfc,0x00,0x00]
 
-v_sub_nc_u16 v5, -1, exec_hi op_sel:[0,0,0]
+v_sub_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0]
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00]
 
-v_sub_nc_u16 v5, 0.5, m0 op_sel:[1,0,0]
+v_sub_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0]
 // GFX11: encoding: [0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x00]
 
-v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_sub_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
 // GFX11: encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00]
 
-v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX11: encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_sub_nc_u16 v5.l, src_scc, vcc_lo
+// GFX11: encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+
+v_sub_nc_u16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_u16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+
+v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi clamp
 // GFX11: encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 v_subrev_co_u32 v5, s6, v1, v2
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
index 3c693c556194e1..c82b61e21edf64 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
@@ -194,47 +194,47 @@ v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bo
 v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: [0xff,0x00,0x47,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
 
-v_add_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: [0xff,0x80,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
 v_add_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
 // GFX11: [0x05,0x00,0x26,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -278,47 +278,47 @@ v_add_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr
 v_add_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: [0xff,0x80,0x26,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
 
-v_add_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: [0xff,0x80,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
 v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX11: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -4116,47 +4116,47 @@ v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:
 v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: [0xff,0xfc,0x01,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
 
-v_sub_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: [0xff,0x80,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
 v_sub_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
 // GFX11: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -4200,47 +4200,47 @@ v_sub_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr
 v_sub_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: [0xff,0x80,0x25,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
 
-v_sub_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: [0xff,0x80,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
 v_subrev_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0]
 // W32: [0x05,0x06,0x02,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -4475,30 +4475,6 @@ v_xor_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX11: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX11: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX11: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX11: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
 v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
 // GFX11: [0x05,0x0a,0x12,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 
@@ -4724,30 +4700,6 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 ban
 v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
 // GFX11: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX11: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX11: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX11: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX11: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
 v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
 // GFX11: encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
index 79709278bc0c7b..73369685a0e6dc 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
@@ -114,14 +114,23 @@ v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: [0xff,0x00,0x47,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: [0x05,0x00,0x0d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x10,0x0d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
-v_add_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: [0xff,0x80,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc0,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: [0x05,0x00,0x26,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -132,14 +141,23 @@ v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_add_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: [0xff,0x80,0x26,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: [0x05,0x00,0x03,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x10,0x03,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
-v_add_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: [0xff,0x80,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc0,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 v_alignbit_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: [0x05,0x00,0x16,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -2601,14 +2619,23 @@ v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: [0xff,0xfc,0x01,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: [0x05,0x00,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x10,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
-v_sub_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: [0xff,0x80,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc0,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: [0x05,0x00,0x25,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -2619,14 +2646,23 @@ v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_sub_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: [0xff,0x80,0x25,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: [0x05,0x00,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x10,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
-v_sub_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: [0xff,0x80,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc0,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 v_subrev_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // W32: [0x05,0x06,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -2748,30 +2784,6 @@ v_xor_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: [0xff,0x00,0x64,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX11: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX11: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-
 v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: [0x05,0x0a,0x12,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 
@@ -2997,30 +3009,6 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
 v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1
 // GFX11: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX11: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX11: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-
 v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4]
 // GFX11: encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
index f28933ec3a8945..1ae1eaf1ceeadd 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
@@ -208,49 +208,58 @@ v_add_lshl_u32 v5, src_scc, vcc_lo, -1
 v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null
 // GFX12: encoding: [0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf]
 
-v_add_nc_i16 v5, v1, v2
+v_add_nc_i16 v5.l, v1.l, v2.l
 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
 
-v_add_nc_i16 v5, v255, v255
+v_add_nc_i16 v5.l, v1.h, v2.l
+// GFX12: encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_i16 v5.l, v255.l, v255.l
 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
 
-v_add_nc_i16 v5, s1, s2
+v_add_nc_i16 v5.l, v255.l, v255.h
+// GFX12: encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+
+v_add_nc_i16 v5.l, s1, s2
 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
 
-v_add_nc_i16 v5, s105, s105
+v_add_nc_i16 v5.l, s105, s105
 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
 
-v_add_nc_i16 v5, vcc_lo, ttmp15
+v_add_nc_i16 v5.l, vcc_lo, ttmp15
 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
 
-v_add_nc_i16 v5, vcc_hi, 0xfe0b
+v_add_nc_i16 v5.l, vcc_hi, 0xfe0b
 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 
-v_add_nc_i16 v5, ttmp15, src_scc
+v_add_nc_i16 v5.l, ttmp15, src_scc
 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
 
-v_add_nc_i16 v5, m0, 0.5
+v_add_nc_i16 v5.l, m0, 0.5
 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x00]
 
-v_add_nc_i16 v5, exec_lo, -1
+v_add_nc_i16 v5.l, exec_lo, -1
 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
 
-v_add_nc_i16 v5, exec_hi, null
+v_add_nc_i16 v5.l, exec_hi, null
 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
 
-v_add_nc_i16 v5, null, exec_lo op_sel:[1,1,1]
+v_add_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1]
 // GFX12: encoding: [0x05,0x58,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
 
-v_add_nc_i16 v5, -1, exec_hi op_sel:[0,0,0]
+v_add_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0]
 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
 
-v_add_nc_i16 v5, 0.5, m0 op_sel:[1,0,0]
+v_add_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0]
 // GFX12: encoding: [0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x00]
 
-v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
 // GFX12: encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
 
-v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX12: encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_add_nc_i16 v255.h, 0xfe0b, vcc_hi clamp
 // GFX12: encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 v_add_nc_i32 v5, v1, v2
@@ -298,49 +307,58 @@ v_add_nc_i32 v5, src_scc, vcc_lo
 v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp
 // GFX12: encoding: [0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_add_nc_u16 v5, v1, v2
+v_add_nc_u16 v5.l, v1.l, v2.l
 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
 
-v_add_nc_u16 v5, v255, v255
+v_add_nc_u16 v5.l, v1.h, v2.l
+// GFX12: encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_u16 v5.l, v255.l, v255.l
 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
 
-v_add_nc_u16 v5, s1, s2
+v_add_nc_u16 v5.l, v255.l, v255.h
+// GFX12: encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+
+v_add_nc_u16 v5.l, s1, s2
 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
 
-v_add_nc_u16 v5, s105, s105
+v_add_nc_u16 v5.l, s105, s105
 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
 
-v_add_nc_u16 v5, vcc_lo, ttmp15
+v_add_nc_u16 v5.l, vcc_lo, ttmp15
 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
 
-v_add_nc_u16 v5, vcc_hi, 0xfe0b
+v_add_nc_u16 v5.l, vcc_hi, 0xfe0b
 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 
-v_add_nc_u16 v5, ttmp15, src_scc
+v_add_nc_u16 v5.l, ttmp15, src_scc
 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
 
-v_add_nc_u16 v5, m0, 0.5
+v_add_nc_u16 v5.l, m0, 0.5
 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x00]
 
-v_add_nc_u16 v5, exec_lo, -1
+v_add_nc_u16 v5.l, exec_lo, -1
 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
 
-v_add_nc_u16 v5, exec_hi, null
+v_add_nc_u16 v5.l, exec_hi, null
 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
 
-v_add_nc_u16 v5, null, exec_lo op_sel:[1,1,1]
+v_add_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1]
 // GFX12: encoding: [0x05,0x58,0x03,0xd7,0x7c,0xfc,0x00,0x00]
 
-v_add_nc_u16 v5, -1, exec_hi op_sel:[0,0,0]
+v_add_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0]
 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00]
 
-v_add_nc_u16 v5, 0.5, m0 op_sel:[1,0,0]
+v_add_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0]
 // GFX12: encoding: [0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x00]
 
-v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
 // GFX12: encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
 
-v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX12: encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_add_nc_u16 v255.h, 0xfe0b, vcc_hi clamp
 // GFX12: encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 v_alignbit_b32 v5, v1, v2, s3
@@ -5696,49 +5714,58 @@ v_sub_co_u32 v5, ttmp[14:15], src_scc, vcc_lo
 v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp
 // GFX12: encoding: [0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_sub_nc_i16 v5, v1, v2
+v_sub_nc_i16 v5.l, v1.l, v2.l
 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00]
 
-v_sub_nc_i16 v5, v255, v255
+v_sub_nc_i16 v5.l, v1.h, v2.l
+// GFX12: encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_i16 v5.l, v255.l, v255.l
 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00]
 
-v_sub_nc_i16 v5, s1, s2
+v_sub_nc_i16 v5.l, v255.l, v255.h
+// GFX12: encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+
+v_sub_nc_i16 v5.l, s1, s2
 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
 
-v_sub_nc_i16 v5, s105, s105
+v_sub_nc_i16 v5.l, s105, s105
 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
 
-v_sub_nc_i16 v5, vcc_lo, ttmp15
+v_sub_nc_i16 v5.l, vcc_lo, ttmp15
 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
 
-v_sub_nc_i16 v5, vcc_hi, 0xfe0b
+v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b
 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 
-v_sub_nc_i16 v5, ttmp15, src_scc
+v_sub_nc_i16 v5.l, ttmp15, src_scc
 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
 
-v_sub_nc_i16 v5, m0, 0.5
+v_sub_nc_i16 v5.l, m0, 0.5
 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x00]
 
-v_sub_nc_i16 v5, exec_lo, -1
+v_sub_nc_i16 v5.l, exec_lo, -1
 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
 
-v_sub_nc_i16 v5, exec_hi, null
+v_sub_nc_i16 v5.l, exec_hi, null
 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
 
-v_sub_nc_i16 v5, null, exec_lo op_sel:[1,1,1]
+v_sub_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1]
 // GFX12: encoding: [0x05,0x58,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
 
-v_sub_nc_i16 v5, -1, exec_hi op_sel:[0,0,0]
+v_sub_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0]
 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
 
-v_sub_nc_i16 v5, 0.5, m0 op_sel:[1,0,0]
+v_sub_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0]
 // GFX12: encoding: [0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x00]
 
-v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
 // GFX12: encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
 
-v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX12: encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi clamp
 // GFX12: encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 v_sub_nc_i32 v5, v1, v2
@@ -5786,49 +5813,58 @@ v_sub_nc_i32 v5, src_scc, vcc_lo
 v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp
 // GFX12: encoding: [0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_sub_nc_u16 v5, v1, v2
+v_sub_nc_u16 v5.l, v1.l, v2.l
 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
 
-v_sub_nc_u16 v5, v255, v255
+v_sub_nc_u16 v5.l, v1.h, v2.l
+// GFX12: encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_u16 v5.l, v255.l, v255.l
 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00]
 
-v_sub_nc_u16 v5, s1, s2
+v_sub_nc_u16 v5.l, v255.l, v255.h
+// GFX12: encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+
+v_sub_nc_u16 v5.l, s1, s2
 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
 
-v_sub_nc_u16 v5, s105, s105
+v_sub_nc_u16 v5.l, s105, s105
 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
 
-v_sub_nc_u16 v5, vcc_lo, ttmp15
+v_sub_nc_u16 v5.l, vcc_lo, ttmp15
 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
 
-v_sub_nc_u16 v5, vcc_hi, 0xfe0b
+v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b
 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 
-v_sub_nc_u16 v5, ttmp15, src_scc
+v_sub_nc_u16 v5.l, ttmp15, src_scc
 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
 
-v_sub_nc_u16 v5, m0, 0.5
+v_sub_nc_u16 v5.l, m0, 0.5
 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x00]
 
-v_sub_nc_u16 v5, exec_lo, -1
+v_sub_nc_u16 v5.l, exec_lo, -1
 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
 
-v_sub_nc_u16 v5, exec_hi, null
+v_sub_nc_u16 v5.l, exec_hi, null
 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
 
-v_sub_nc_u16 v5, null, exec_lo op_sel:[1,1,1]
+v_sub_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1]
 // GFX12: encoding: [0x05,0x58,0x04,0xd7,0x7c,0xfc,0x00,0x00]
 
-v_sub_nc_u16 v5, -1, exec_hi op_sel:[0,0,0]
+v_sub_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0]
 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00]
 
-v_sub_nc_u16 v5, 0.5, m0 op_sel:[1,0,0]
+v_sub_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0]
 // GFX12: encoding: [0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x00]
 
-v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_sub_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
 // GFX12: encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00]
 
-v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX12: encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi clamp
 // GFX12: encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 v_subrev_co_u32 v5, s6, v1, v2
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
index adf37901fc85fd..56bd0ee4b47465 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
@@ -214,47 +214,71 @@ v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bo
 v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: [0xff,0x00,0x47,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
-// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
+// GFX12: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
-// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_add_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: [0xff,0x80,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror
-// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror
-// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1
-// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
+// GFX12: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15
-// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1
-// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15
-// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
+// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror
+// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1
 // GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15
 // GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
 // GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
 // GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1
+// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15
+// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1
+// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15
+// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 // GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
 
-v_add_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: [0xff,0x80,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+
+v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+
+v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
 v_add_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
 // GFX12: [0x05,0x00,0x26,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -298,47 +322,71 @@ v_add_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr
 v_add_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: [0xff,0x80,0x26,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
-// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
+// GFX12: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
-// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_add_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: [0xff,0x80,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror
-// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror
-// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1
-// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
+// GFX12: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15
-// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1
-// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15
-// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
+// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror
+// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1
 // GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15
 // GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
 // GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
 // GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1
+// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15
+// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1
+// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15
+// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 // GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
 
-v_add_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: [0xff,0x80,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+
+v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+
+v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
 v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX12: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -4622,47 +4670,71 @@ v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:
 v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: [0xff,0xfc,0x01,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
-// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
+// GFX12: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
-// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_sub_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: [0xff,0x80,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror
-// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror
-// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1
-// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
+// GFX12: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15
-// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1
-// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15
-// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
+// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror
+// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1
 // GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15
 // GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
 // GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
 // GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1
+// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15
+// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1
+// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15
+// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 // GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
 
-v_sub_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: [0xff,0x80,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+
+v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
 v_sub_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
 // GFX12: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -4706,47 +4778,71 @@ v_sub_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr
 v_sub_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: [0xff,0x80,0x25,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
-// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
+// GFX12: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
-// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_sub_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: [0xff,0x80,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror
-// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror
-// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1
-// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
+// GFX12: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15
-// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1
-// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15
-// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
+// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror
+// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1
 // GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15
 // GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
 // GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
 // GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1
+// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15
+// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1
+// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15
+// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 // GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
 
-v_sub_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: [0xff,0x80,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+
+v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
 v_subrev_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0]
 // W32: [0x05,0x06,0x02,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -5001,30 +5097,6 @@ v_xor_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX12: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX12: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX12: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX12: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
 v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
 // GFX12: [0x05,0x0a,0x12,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
 
@@ -5250,30 +5322,6 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 ban
 v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
 // GFX12: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX12: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX12: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX12: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX12: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
 v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
 // GFX12: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
index 1be122faccbc92..6331d22c6976d9 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
@@ -134,14 +134,38 @@ v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: [0xff,0x00,0x47,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_add_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: [0xff,0x80,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+
+v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
+// GFX12: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
 // GFX12: [0x05,0x00,0x0d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
-v_add_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: [0xff,0x80,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x05,0x10,0x0d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: [0xff,0xc0,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: [0x05,0x00,0x26,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -152,14 +176,38 @@ v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_add_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: [0xff,0x80,0x26,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_add_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: [0xff,0x80,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+
+v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
+// GFX12: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
 // GFX12: [0x05,0x00,0x03,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
-v_add_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: [0xff,0x80,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x05,0x10,0x03,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: [0xff,0xc0,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 v_alignbit_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: [0x05,0x00,0x16,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -3043,14 +3091,38 @@ v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: [0xff,0xfc,0x01,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_sub_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: [0xff,0x80,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+
+v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
+// GFX12: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
 // GFX12: [0x05,0x00,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
-v_sub_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: [0xff,0x80,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x05,0x10,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: [0xff,0xc0,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: [0x05,0x00,0x25,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -3061,14 +3133,38 @@ v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_sub_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: [0xff,0x80,0x25,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_sub_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: [0xff,0x80,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+
+v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
+// GFX12: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
 // GFX12: [0x05,0x00,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
-v_sub_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: [0xff,0x80,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x05,0x10,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: [0xff,0xc0,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 v_subrev_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // W32: [0x05,0x06,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -3210,30 +3306,6 @@ v_xor_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: [0xff,0x00,0x64,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX12: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX12: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-
 v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: [0x05,0x0a,0x12,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 
@@ -3459,30 +3531,6 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
 v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1
 // GFX12: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX12: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX12: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-
 v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4]
 // GFX12: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
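
The dpp8 trailer bytes in these tests follow a simple pattern; the sketch below (an illustration with a made-up helper name, not part of the patch) assumes what the encodings suggest: the src0 field byte is 0xe9 for dpp8 and 0xea when fi:1 is set, and the eight 3-bit lane selects are packed LSB-first into the final three bytes.

def dpp8_trailer(lanes, fi=0):
    # lanes is the dpp8:[...] selector list; returns (src0_field, [b0, b1, b2]).
    assert len(lanes) == 8 and all(0 <= lane <= 7 for lane in lanes)
    packed = 0
    for i, lane in enumerate(lanes):
        packed |= lane << (3 * i)
    src0_field = 0xEA if fi else 0xE9
    return src0_field, [(packed >> (8 * k)) & 0xFF for k in range(3)]

# Matches the encodings above:
assert dpp8_trailer([7, 6, 5, 4, 3, 2, 1, 0], fi=1) == (0xEA, [0x77, 0x39, 0x05])
assert dpp8_trailer([0, 1, 2, 3, 4, 4, 4, 4]) == (0xE9, [0x88, 0x46, 0x92])
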
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
index 07058a64515920..365caa5f9b6d09 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
@@ -189,49 +189,112 @@
 # GFX11: v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf]
 0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf
 
-# GFX11: v_add_nc_i16 v5, v1, v2                 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
-0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00
-
-# GFX11: v_add_nc_i16 v5, v255, v255             ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
-0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00
-
-# GFX11: v_add_nc_i16 v5, s1, s2                 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_add_nc_i16 v5.l, s1, s2               ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, s1, s2                 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, s1, s2               ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, s1, s2                 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
 0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00
 
-# GFX11: v_add_nc_i16 v5, s105, s105             ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, s105, s105           ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, s105, s105             ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, s105, s105           ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, s105, s105             ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
 0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00
 
-# GFX11: v_add_nc_i16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
 0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00
 
-# GFX11: v_add_nc_i16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
 
-# GFX11: v_add_nc_i16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, ttmp15, src_scc      ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, ttmp15, src_scc      ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
 0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00
 
-# GFX11: v_add_nc_i16 v5, m0, 0x3800
+# W32-REAL16: v_add_nc_i16 v5.l, m0, 0x3800           ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, m0, 0x3800             ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, m0, 0x3800           ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, m0, 0x3800             ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
 0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x00
 
-# GFX11: v_add_nc_i16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, exec_lo, -1          ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, exec_lo, -1          ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
 0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00
 
-# GFX11: v_add_nc_i16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, exec_hi, null        ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, exec_hi, null        ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
 0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00
 
-# GFX11: v_add_nc_i16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, null, exec_lo        ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, null, exec_lo        ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
 0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00
 
-# GFX11: v_add_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
-0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00
-
-# GFX11: v_add_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0]
-0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x00
-
-# GFX11: v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
-0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00
+# W32-REAL16: v_add_nc_i16 v5.l, -1, exec_hi          ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, -1, exec_hi            ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, -1, exec_hi          ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, -1, exec_hi            ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00
+
+# W32-REAL16: v_add_nc_i16 v5.l, 0x3800, m0           ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, 0x3800, m0             ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, 0x3800, m0           ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, 0x3800, m0             ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+0x05,0x00,0x0d,0xd7,0xf0,0xfa,0x00,0x00
+
+# W32-REAL16: v_add_nc_i16 v5.l, src_scc, vcc_lo      ; encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, src_scc, vcc_lo        ; encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, src_scc, vcc_lo      ; encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, src_scc, vcc_lo        ; encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00
+
+# W32-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
 
-# GFX11: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
 
 # GFX11: v_add_nc_i32 v5, v1, v2                 ; encoding: [0x05,0x00,0x26,0xd7,0x01,0x05,0x02,0x00]
@@ -279,49 +342,112 @@
 # GFX11: v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
 
-# GFX11: v_add_nc_u16 v5, v1, v2                 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
-0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00
-
-# GFX11: v_add_nc_u16 v5, v255, v255             ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
-0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00
-
-# GFX11: v_add_nc_u16 v5, s1, s2                 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_add_nc_u16 v5.l, s1, s2               ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, s1, s2                 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, s1, s2               ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, s1, s2                 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
 0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00
 
-# GFX11: v_add_nc_u16 v5, s105, s105             ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, s105, s105           ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, s105, s105             ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, s105, s105           ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, s105, s105             ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
 0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00
 
-# GFX11: v_add_nc_u16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
 0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00
 
-# GFX11: v_add_nc_u16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
 
-# GFX11: v_add_nc_u16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, ttmp15, src_scc      ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, ttmp15, src_scc      ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
 0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00
 
-# GFX11: v_add_nc_u16 v5, m0, 0x3800
+# W32-REAL16: v_add_nc_u16 v5.l, m0, 0x3800           ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, m0, 0x3800             ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, m0, 0x3800           ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, m0, 0x3800             ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
 0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x00
 
-# GFX11: v_add_nc_u16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, exec_lo, -1          ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, exec_lo, -1          ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
 0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00
 
-# GFX11: v_add_nc_u16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, exec_hi, null        ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, exec_hi, null        ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
 0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00
 
-# GFX11: v_add_nc_u16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, null, exec_lo        ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, null, exec_lo        ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
 0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00
 
-# GFX11: v_add_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00]
-0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00
-
-# GFX11: v_add_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0]
-0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x00
-
-# GFX11: v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
-0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00
+# W32-REAL16: v_add_nc_u16 v5.l, -1, exec_hi          ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, -1, exec_hi            ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, -1, exec_hi          ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, -1, exec_hi            ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16 v5.l, 0x3800, m0           ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, 0x3800, m0             ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, 0x3800, m0           ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, 0x3800, m0             ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+0x05,0x00,0x03,0xd7,0xf0,0xfa,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16 v5.l, src_scc, vcc_lo      ; encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, src_scc, vcc_lo        ; encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, src_scc, vcc_lo      ; encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, src_scc, vcc_lo        ; encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
 
-# GFX11: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
 
 # GFX11: v_alignbit_b32 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x16,0xd6,0x01,0x05,0x0e,0x00]
@@ -5871,49 +5997,112 @@
 # GFX11: v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
 
-# GFX11: v_sub_nc_i16 v5, v1, v2                 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00]
-0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00
-
-# GFX11: v_sub_nc_i16 v5, v255, v255             ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00]
-0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00
-
-# GFX11: v_sub_nc_i16 v5, s1, s2                 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_sub_nc_i16 v5.l, s1, s2               ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, s1, s2                 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, s1, s2               ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, s1, s2                 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
 0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00
 
-# GFX11: v_sub_nc_i16 v5, s105, s105             ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, s105, s105           ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, s105, s105             ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, s105, s105           ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, s105, s105             ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
 0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00
 
-# GFX11: v_sub_nc_i16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
 0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00
 
-# GFX11: v_sub_nc_i16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
 
-# GFX11: v_sub_nc_i16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, ttmp15, src_scc      ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, ttmp15, src_scc      ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
 0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00
 
-# GFX11: v_sub_nc_i16 v5, m0, 0x3800
+# W32-REAL16: v_sub_nc_i16 v5.l, m0, 0x3800           ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, m0, 0x3800             ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, m0, 0x3800           ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, m0, 0x3800             ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
 0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x00
 
-# GFX11: v_sub_nc_i16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, exec_lo, -1          ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, exec_lo, -1          ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
 0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00
 
-# GFX11: v_sub_nc_i16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, exec_hi, null        ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, exec_hi, null        ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
 0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00
 
-# GFX11: v_sub_nc_i16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, null, exec_lo        ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, null, exec_lo        ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
 0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00
 
-# GFX11: v_sub_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
-0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00
-
-# GFX11: v_sub_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0]
-0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x00
-
-# GFX11: v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
-0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00
+# W32-REAL16: v_sub_nc_i16 v5.l, -1, exec_hi          ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, -1, exec_hi            ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, -1, exec_hi          ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, -1, exec_hi            ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00
+
+# W32-REAL16: v_sub_nc_i16 v5.l, 0x3800, m0           ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, 0x3800, m0             ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, 0x3800, m0           ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, 0x3800, m0             ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+0x05,0x00,0x0e,0xd7,0xf0,0xfa,0x00,0x00
+
+# W32-REAL16: v_sub_nc_i16 v5.l, src_scc, vcc_lo      ; encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, src_scc, vcc_lo        ; encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, src_scc, vcc_lo      ; encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, src_scc, vcc_lo        ; encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00
+
+# W32-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
 
-# GFX11: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
 
 # GFX11: v_sub_nc_i32 v5, v1, v2                 ; encoding: [0x05,0x00,0x25,0xd7,0x01,0x05,0x02,0x00]
@@ -5961,49 +6150,112 @@
 # GFX11: v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
 
-# GFX11: v_sub_nc_u16 v5, v1, v2                 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
-0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00
-
-# GFX11: v_sub_nc_u16 v5, v255, v255             ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00]
-0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00
-
-# GFX11: v_sub_nc_u16 v5, s1, s2                 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_sub_nc_u16 v5.l, s1, s2               ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, s1, s2                 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, s1, s2               ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, s1, s2                 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
 0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00
 
-# GFX11: v_sub_nc_u16 v5, s105, s105             ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, s105, s105           ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, s105, s105             ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, s105, s105           ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, s105, s105             ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
 0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00
 
-# GFX11: v_sub_nc_u16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
 0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00
 
-# GFX11: v_sub_nc_u16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
 
-# GFX11: v_sub_nc_u16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, ttmp15, src_scc      ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, ttmp15, src_scc      ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
 0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00
 
-# GFX11: v_sub_nc_u16 v5, m0, 0x3800
+# W32-REAL16: v_sub_nc_u16 v5.l, m0, 0x3800           ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, m0, 0x3800             ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, m0, 0x3800           ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, m0, 0x3800             ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
 0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x00
 
-# GFX11: v_sub_nc_u16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, exec_lo, -1          ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, exec_lo, -1          ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
 0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00
 
-# GFX11: v_sub_nc_u16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, exec_hi, null        ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, exec_hi, null        ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
 0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00
 
-# GFX11: v_sub_nc_u16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, null, exec_lo        ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, null, exec_lo        ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
 0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00
 
-# GFX11: v_sub_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00]
-0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00
-
-# GFX11: v_sub_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0]
-0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x00
-
-# GFX11: v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00]
-0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00
+# W32-REAL16: v_sub_nc_u16 v5.l, -1, exec_hi          ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, -1, exec_hi            ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, -1, exec_hi          ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, -1, exec_hi            ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16 v5.l, 0x3800, m0           ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, 0x3800, m0             ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, 0x3800, m0           ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, 0x3800, m0             ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+0x05,0x00,0x04,0xd7,0xf0,0xfa,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16 v5.l, src_scc, vcc_lo      ; encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, src_scc, vcc_lo        ; encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, src_scc, vcc_lo      ; encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, src_scc, vcc_lo        ; encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
 
-# GFX11: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
 
 # W32: v_subrev_co_u32 v5, s12, v1, v2           ; encoding: [0x05,0x0c,0x02,0xd7,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
index 4ae8b053f0e0f3..d0bd6398ad10a0 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
@@ -3824,88 +3824,220 @@
 # W64-FAKE16: v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
 
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
 
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
 
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
 
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
 
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
 
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
 
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
 
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
 
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
 
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
 
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
 
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
 
-# GFX11: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
 
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
 
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
 
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
 
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
 
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
 
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
 
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
 
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
 
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
 
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
 
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
 
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
 
-# GFX11: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
 
 # GFX11: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -4664,88 +4796,220 @@
 # GFX11: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
 
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
 
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
 
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
 
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
 
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
 
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
 
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
 
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
 
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
 
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
 
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
 
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
 
-# GFX11: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
 
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
 
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
 
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
 
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
 
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
 
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
 
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
 
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
 
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
 
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
 
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
 
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
 
-# GFX11: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
 
 # GFX11: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
index b44dba748666c7..cbf5a3d11e50b8 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
@@ -2168,34 +2168,112 @@
 # W64-FAKE16: v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0xff,0x00,0x64,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
 
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX11: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
 
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX11: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
 
 # GFX11: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -2840,34 +2918,112 @@
 # GFX11: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
 
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX11: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
 
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX11: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
 
 # GFX11: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
index af04a31423b6fe..c87c8855f5cdc0 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
@@ -153,49 +153,112 @@
 # GFX12: v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf]
 0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf
 
-# GFX12: v_add_nc_i16 v5, v1, v2                 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, v1.l, v2.l           ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v1, v2                 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v1.l, v2.l           ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v1, v2                 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
 0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00
 
-# GFX12: v_add_nc_i16 v5, v255, v255             ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, v255.l, v255.l       ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v255, v255             ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v255.l, v255.l       ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v255, v255             ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
 0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00
 
-# GFX12: v_add_nc_i16 v5, s1, s2                 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, s1, s2               ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, s1, s2                 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, s1, s2               ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, s1, s2                 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
 0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00
 
-# GFX12: v_add_nc_i16 v5, s105, s105             ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, s105, s105           ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, s105, s105             ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, s105, s105           ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, s105, s105             ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
 0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00
 
-# GFX12: v_add_nc_i16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
 0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00
 
-# GFX12: v_add_nc_i16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
 
-# GFX12: v_add_nc_i16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, ttmp15, src_scc      ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, ttmp15, src_scc      ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
 0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00
 
-# GFX12: v_add_nc_i16 v5, m0, 0x3800
+# W32-REAL16: v_add_nc_i16 v5.l, m0, 0x3800           ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, m0, 0x3800             ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, m0, 0x3800           ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, m0, 0x3800             ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
 0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x00
 
-# GFX12: v_add_nc_i16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, exec_lo, -1          ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, exec_lo, -1          ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
 0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00
 
-# GFX12: v_add_nc_i16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, exec_hi, null        ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, exec_hi, null        ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
 0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00
 
-# GFX12: v_add_nc_i16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, null, exec_lo        ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, null, exec_lo        ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
 0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00
 
-# GFX12: v_add_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
 0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00
 
-# GFX12: v_add_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0]
+# W32-REAL16: v_add_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
 0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x00
 
-# GFX12: v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
 0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00
 
-# GFX12: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+
+# W32-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
 
 # GFX12: v_add_nc_i32 v5, v1, v2                 ; encoding: [0x05,0x00,0x26,0xd7,0x01,0x05,0x02,0x00]
@@ -243,49 +306,112 @@
 # GFX12: v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
 
-# GFX12: v_add_nc_u16 v5, v1, v2                 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, v1.l, v2.l           ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v1, v2                 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v1.l, v2.l           ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v1, v2                 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
 0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00
 
-# GFX12: v_add_nc_u16 v5, v255, v255             ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, v255.l, v255.l       ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v255, v255             ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v255.l, v255.l       ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v255, v255             ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
 0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00
 
-# GFX12: v_add_nc_u16 v5, s1, s2                 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, s1, s2               ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, s1, s2                 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, s1, s2               ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, s1, s2                 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
 0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00
 
-# GFX12: v_add_nc_u16 v5, s105, s105             ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, s105, s105           ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, s105, s105             ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, s105, s105           ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, s105, s105             ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
 0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00
 
-# GFX12: v_add_nc_u16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
 0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00
 
-# GFX12: v_add_nc_u16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
 
-# GFX12: v_add_nc_u16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, ttmp15, src_scc      ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, ttmp15, src_scc      ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
 0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00
 
-# GFX12: v_add_nc_u16 v5, m0, 0x3800
+# W32-REAL16: v_add_nc_u16 v5.l, m0, 0x3800           ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, m0, 0x3800             ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, m0, 0x3800           ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, m0, 0x3800             ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
 0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x00
 
-# GFX12: v_add_nc_u16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, exec_lo, -1          ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, exec_lo, -1          ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
 0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00
 
-# GFX12: v_add_nc_u16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, exec_hi, null        ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, exec_hi, null        ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
 0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00
 
-# GFX12: v_add_nc_u16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, null, exec_lo        ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, null, exec_lo        ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
 0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00
 
-# GFX12: v_add_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00]
 0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00
 
-# GFX12: v_add_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0]
+# W32-REAL16: v_add_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
 0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x00
 
-# GFX12: v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
 0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00
 
-# GFX12: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
 
 # GFX12: v_alignbit_b32 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x16,0xd6,0x01,0x05,0x0e,0x00]
@@ -5797,49 +5923,112 @@
 # GFX12: v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
 
-# GFX12: v_sub_nc_i16 v5, v1, v2                 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, v1.l, v2.l           ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, v1, v2                 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, v1.l, v2.l           ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, v1, v2                 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00]
 0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00
 
-# GFX12: v_sub_nc_i16 v5, v255, v255             ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.l       ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, v255, v255             ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.l       ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, v255, v255             ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00]
 0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00
 
-# GFX12: v_sub_nc_i16 v5, s1, s2                 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, s1, s2               ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, s1, s2                 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, s1, s2               ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, s1, s2                 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
 0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00
 
-# GFX12: v_sub_nc_i16 v5, s105, s105             ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, s105, s105           ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, s105, s105             ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, s105, s105           ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, s105, s105             ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
 0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00
 
-# GFX12: v_sub_nc_i16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
 0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00
 
-# GFX12: v_sub_nc_i16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
 
-# GFX12: v_sub_nc_i16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, ttmp15, src_scc      ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, ttmp15, src_scc      ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
 0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00
 
-# GFX12: v_sub_nc_i16 v5, m0, 0x3800
+# W32-REAL16: v_sub_nc_i16 v5.l, m0, 0x3800           ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, m0, 0x3800             ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, m0, 0x3800           ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, m0, 0x3800             ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
 0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x00
 
-# GFX12: v_sub_nc_i16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, exec_lo, -1          ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, exec_lo, -1          ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
 0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00
 
-# GFX12: v_sub_nc_i16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, exec_hi, null        ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, exec_hi, null        ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
 0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00
 
-# GFX12: v_sub_nc_i16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, null, exec_lo        ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, null, exec_lo        ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
 0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00
 
-# GFX12: v_sub_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
 0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00
 
-# GFX12: v_sub_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0]
+# W32-REAL16: v_sub_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
 0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x00
 
-# GFX12: v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
 0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00
 
-# GFX12: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+
+# W32-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
 
 # GFX12: v_sub_nc_i32 v5, v1, v2                 ; encoding: [0x05,0x00,0x25,0xd7,0x01,0x05,0x02,0x00]
@@ -5887,49 +6076,112 @@
 # GFX12: v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
 
-# GFX12: v_sub_nc_u16 v5, v1, v2                 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, v1.l, v2.l           ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, v1, v2                 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, v1.l, v2.l           ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, v1, v2                 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
 0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00
 
-# GFX12: v_sub_nc_u16 v5, v255, v255             ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.l       ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, v255, v255             ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.l       ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, v255, v255             ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00]
 0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00
 
-# GFX12: v_sub_nc_u16 v5, s1, s2                 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, s1, s2               ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, s1, s2                 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, s1, s2               ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, s1, s2                 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
 0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00
 
-# GFX12: v_sub_nc_u16 v5, s105, s105             ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, s105, s105           ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, s105, s105             ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, s105, s105           ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, s105, s105             ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
 0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00
 
-# GFX12: v_sub_nc_u16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, vcc_lo, ttmp15         ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
 0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00
 
-# GFX12: v_sub_nc_u16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, vcc_hi, 0xfe0b         ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
 
-# GFX12: v_sub_nc_u16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, ttmp15, src_scc      ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, ttmp15, src_scc      ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, ttmp15, src_scc        ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
 0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00
 
-# GFX12: v_sub_nc_u16 v5, m0, 0x3800
+# W32-REAL16: v_sub_nc_u16 v5.l, m0, 0x3800           ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, m0, 0x3800             ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, m0, 0x3800           ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, m0, 0x3800             ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
 0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x00
 
-# GFX12: v_sub_nc_u16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, exec_lo, -1          ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, exec_lo, -1          ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, exec_lo, -1            ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
 0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00
 
-# GFX12: v_sub_nc_u16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, exec_hi, null        ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, exec_hi, null        ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, exec_hi, null          ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
 0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00
 
-# GFX12: v_sub_nc_u16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, null, exec_lo        ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, null, exec_lo        ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, null, exec_lo          ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
 0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00
 
-# GFX12: v_sub_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00]
 0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00
 
-# GFX12: v_sub_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0]
+# W32-REAL16: v_sub_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
 0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x00
 
-# GFX12: v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00]
 0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00
 
-# GFX12: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0]  ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
 
 # W32: v_subrev_co_u32 v5, s12, v1, v2           ; encoding: [0x05,0x0c,0x02,0xd7,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
index 65cfdd5ef7de03..5081b9811e43ef 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
@@ -4115,88 +4115,268 @@
 # W64-FAKE16: v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
 
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
 
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
 
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
 
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
 
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
 
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
 
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
 
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
 
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
 
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
 
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
 
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
 
-# GFX12: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
 
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
 
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
 
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
 
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
 
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
 
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
 
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
 
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
 
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
 
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
 
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
 
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
 
-# GFX12: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
 
 # GFX12: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -5000,88 +5180,268 @@
 # GFX12: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
 
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
 
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
 
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
 
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
 
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
 
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
 
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
 
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
 
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
 
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
 
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
 
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
 
-# GFX12: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
 
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
 
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
 
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
 
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
 
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
 
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
 
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
 
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
 
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
 
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
 
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
 
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
 
-# GFX12: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
 
 # GFX12: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
index 4640b967cbc07b..77f05027d1cfd2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
@@ -2393,34 +2393,160 @@
 # W64-FAKE16: v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0xff,0x00,0x64,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
 
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX12: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
 
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX12: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
 
 # GFX12: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -3113,34 +3239,160 @@
 # GFX12: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
 
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX12: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
 
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
-# GFX12: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
 
 # GFX12: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]

>From 95c24cb9de54f81b07ee4abd594fc32905063c68 Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Wed, 16 Oct 2024 16:33:13 +0200
Subject: [PATCH 77/82] [libc][math][c23] Add exp10m1f16 C23 math function
 (#105706)

Part of #95250.
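
For reference, a minimal usage sketch of the new entrypoint (hypothetical
example, not part of the patch; assumes a C23-capable toolchain and a target
where LIBC_TYPES_HAS_FLOAT16 is enabled, so that math.h declares exp10m1f16):

```
#include <math.h>
#include <stdio.h>

int main() {
  _Float16 x = (_Float16)2.0;
  // exp10m1f16 computes 10^x - 1 in half precision: 10^2 - 1 = 99.
  _Float16 r = exp10m1f16(x);
  printf("exp10m1f16(2.0) = %g\n", (double)r);
  return 0;
}
```
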
---
 libc/config/gpu/entrypoints.txt              |   1 +
 libc/config/linux/x86_64/entrypoints.txt     |   1 +
 libc/docs/math/index.rst                     |   2 +-
 libc/spec/stdc.td                            |   2 +
 libc/src/math/CMakeLists.txt                 |   2 +
 libc/src/math/exp10m1f16.h                   |  21 +++
 libc/src/math/generic/CMakeLists.txt         |  23 +++
 libc/src/math/generic/exp10f16.cpp           |  47 +-----
 libc/src/math/generic/exp10m1f16.cpp         | 163 +++++++++++++++++++
 libc/src/math/generic/expxf16.h              |  47 ++++++
 libc/test/src/math/CMakeLists.txt            |  11 ++
 libc/test/src/math/exp10m1f16_test.cpp       |  40 +++++
 libc/test/src/math/smoke/CMakeLists.txt      |  13 ++
 libc/test/src/math/smoke/exp10m1f16_test.cpp | 113 +++++++++++++
 libc/utils/MPFRWrapper/MPFRUtils.cpp         |  25 +++
 libc/utils/MPFRWrapper/MPFRUtils.h           |   1 +
 16 files changed, 467 insertions(+), 45 deletions(-)
 create mode 100644 libc/src/math/exp10m1f16.h
 create mode 100644 libc/src/math/generic/exp10m1f16.cpp
 create mode 100644 libc/test/src/math/exp10m1f16_test.cpp
 create mode 100644 libc/test/src/math/smoke/exp10m1f16_test.cpp

diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index b4cfe47f4505fd..251ad43ece8d05 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -522,6 +522,7 @@ if(LIBC_TYPES_HAS_FLOAT16)
     libc.src.math.ceilf16
     libc.src.math.copysignf16
     libc.src.math.exp10f16
+    libc.src.math.exp10m1f16
     libc.src.math.exp2f16
     libc.src.math.expf16
     libc.src.math.f16add
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 2589da3756e155..3ca14ec03de3c7 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -611,6 +611,7 @@ if(LIBC_TYPES_HAS_FLOAT16)
     libc.src.math.ceilf16
     libc.src.math.copysignf16
     libc.src.math.exp10f16
+    libc.src.math.exp10m1f16
     libc.src.math.exp2f16
     libc.src.math.exp2m1f16
     libc.src.math.expf16
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index 72e8f6689a3623..95ac7f4f12f958 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -292,7 +292,7 @@ Higher Math Functions
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | exp10     | |check|          | |check|         |                        | |check|              |                        | 7.12.6.2               | F.10.3.2                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| exp10m1   |                  |                 |                        |                      |                        | 7.12.6.3               | F.10.3.3                   |
+| exp10m1   |                  |                 |                        | |check|              |                        | 7.12.6.3               | F.10.3.3                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | exp2      | |check|          | |check|         |                        | |check|              |                        | 7.12.6.4               | F.10.3.4                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 1b255690c2e453..ea032ba5f66e71 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -692,6 +692,8 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"exp10f", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
           GuardedFunctionSpec<"exp10f16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
 
+          GuardedFunctionSpec<"exp10m1f16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
+
           FunctionSpec<"remainder", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
           FunctionSpec<"remainderf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
           FunctionSpec<"remainderl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 7803369583de47..ecf63968481458 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -127,6 +127,8 @@ add_math_entrypoint_object(exp10)
 add_math_entrypoint_object(exp10f)
 add_math_entrypoint_object(exp10f16)
 
+add_math_entrypoint_object(exp10m1f16)
+
 add_math_entrypoint_object(expm1)
 add_math_entrypoint_object(expm1f)
 add_math_entrypoint_object(expm1f16)
diff --git a/libc/src/math/exp10m1f16.h b/libc/src/math/exp10m1f16.h
new file mode 100644
index 00000000000000..e195bc431f2e14
--- /dev/null
+++ b/libc/src/math/exp10m1f16.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for exp10m1f16 --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_EXP10M1F16_H
+#define LLVM_LIBC_SRC_MATH_EXP10M1F16_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float16 exp10m1f16(float16 x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_EXP10M1F16_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 1ad611fa168cc0..ffa74970a2ab5d 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1656,6 +1656,29 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  exp10m1f16
+  SRCS
+    exp10m1f16.cpp
+  HDRS
+    ../exp10m1f16.h
+  DEPENDS
+    .expxf16
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.FPUtil.cast
+    libc.src.__support.FPUtil.except_value_utils
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.polyeval
+    libc.src.__support.FPUtil.rounding_mode
+    libc.src.__support.macros.optimization
+    libc.src.__support.macros.properties.cpu_features
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   expm1
   SRCS
diff --git a/libc/src/math/generic/exp10f16.cpp b/libc/src/math/generic/exp10f16.cpp
index 1c5966c1f1c126..f7a8ee3245eda6 100644
--- a/libc/src/math/generic/exp10f16.cpp
+++ b/libc/src/math/generic/exp10f16.cpp
@@ -54,16 +54,6 @@ static constexpr fputil::ExceptValues<float16, N_EXP10F16_EXCEPTS>
 #endif
     }};
 
-// Generated by Sollya with the following commands:
-//   > display = hexadecimal;
-//   > round(log2(10), SG, RN);
-static constexpr float LOG2F_10 = 0x1.a934fp+1f;
-
-// Generated by Sollya with the following commands:
-//   > display = hexadecimal;
-//   > round(log10(2), SG, RN);
-static constexpr float LOG10F_2 = 0x1.344136p-2f;
-
 LLVM_LIBC_FUNCTION(float16, exp10f16, (float16 x)) {
   using FPBits = fputil::FPBits<float16>;
   FPBits x_bits(x);
@@ -132,40 +122,9 @@ LLVM_LIBC_FUNCTION(float16, exp10f16, (float16 x)) {
   if (auto r = EXP10F16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
     return r.value();
 
-  // For -8 < x < 5, to compute 10^x, we perform the following range reduction:
-  // find hi, mid, lo, such that:
-  //   x = (hi + mid) * log2(10) + lo, in which
-  //     hi is an integer,
-  //     mid * 2^3 is an integer,
-  //     -2^(-4) <= lo < 2^(-4).
-  // In particular,
-  //   hi + mid = round(x * 2^3) * 2^(-3).
-  // Then,
-  //   10^x = 10^(hi + mid + lo) = 2^((hi + mid) * log2(10)) + 10^lo
-  // We store 2^mid in the lookup table EXP2_MID_BITS, and compute 2^hi * 2^mid
-  // by adding hi to the exponent field of 2^mid.  10^lo is computed using a
-  // degree-4 minimax polynomial generated by Sollya.
-
-  float xf = x;
-  float kf = fputil::nearest_integer(xf * (LOG2F_10 * 0x1.0p+3f));
-  int x_hi_mid = static_cast<int>(kf);
-  int x_hi = x_hi_mid >> 3;
-  int x_mid = x_hi_mid & 0x7;
-  // lo = x - (hi + mid) = round(x * 2^3 * log2(10)) * log10(2) * (-2^(-3)) + x
-  float lo = fputil::multiply_add(kf, LOG10F_2 * -0x1.0p-3f, xf);
-
-  uint32_t exp2_hi_mid_bits =
-      EXP2_MID_BITS[x_mid] +
-      static_cast<uint32_t>(x_hi << fputil::FPBits<float>::FRACTION_LEN);
-  float exp2_hi_mid = fputil::FPBits<float>(exp2_hi_mid_bits).get_val();
-  // Degree-4 minimax polynomial generated by Sollya with the following
-  // commands:
-  //   > display = hexadecimal;
-  //   > P = fpminimax((10^x - 1)/x, 3, [|SG...|], [-2^-4, 2^-4]);
-  //   > 1 + x * P;
-  float exp10_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.26bb14p+1f, 0x1.53526p+1f,
-                                    0x1.04b434p+1f, 0x1.2bcf9ep+0f);
-  return fputil::cast<float16>(exp2_hi_mid * exp10_lo);
+  // 10^x = 2^(hi + mid) * 10^lo
+  auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x);
+  return static_cast<float16>(exp2_hi_mid * exp10_lo);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/exp10m1f16.cpp b/libc/src/math/generic/exp10m1f16.cpp
new file mode 100644
index 00000000000000..9f2c1959fa5ec9
--- /dev/null
+++ b/libc/src/math/generic/exp10m1f16.cpp
@@ -0,0 +1,163 @@
+//===-- Half-precision 10^x - 1 function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/exp10m1f16.h"
+#include "expxf16.h"
+#include "hdr/errno_macros.h"
+#include "hdr/fenv_macros.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+#include "src/__support/macros/properties/cpu_features.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+static constexpr fputil::ExceptValues<float16, 3> EXP10M1F16_EXCEPTS_LO = {{
+    // (input, RZ output, RU offset, RD offset, RN offset)
+    // x = 0x1.5c4p-4, exp10m1f16(x) = 0x1.bacp-3 (RZ)
+    {0x2d71U, 0x32ebU, 1U, 0U, 0U},
+    // x = -0x1.5ep-13, exp10m1f16(x) = -0x1.92cp-12 (RZ)
+    {0x8978U, 0x8e4bU, 0U, 1U, 0U},
+    // x = -0x1.e2p-10, exp10m1f16(x) = -0x1.14cp-8 (RZ)
+    {0x9788U, 0x9c53U, 0U, 1U, 0U},
+}};
+
+#ifdef LIBC_TARGET_CPU_HAS_FMA
+static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 3;
+#else
+static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 6;
+#endif
+
+static constexpr fputil::ExceptValues<float16, N_EXP10M1F16_EXCEPTS_HI>
+    EXP10M1F16_EXCEPTS_HI = {{
+        // (input, RZ output, RU offset, RD offset, RN offset)
+        // x = 0x1.8f4p-2, exp10m1f16(x) = 0x1.744p+0 (RZ)
+        {0x363dU, 0x3dd1U, 1U, 0U, 0U},
+        // x = 0x1.95cp-2, exp10m1f16(x) = 0x1.7d8p+0 (RZ)
+        {0x3657U, 0x3df6U, 1U, 0U, 0U},
+        // x = 0x1.d04p-2, exp10m1f16(x) = 0x1.d7p+0 (RZ)
+        {0x3741U, 0x3f5cU, 1U, 0U, 1U},
+#ifndef LIBC_TARGET_CPU_HAS_FMA
+        // x = 0x1.0cp+1, exp10m1f16(x) = 0x1.ec4p+6 (RZ)
+        {0x4030U, 0x57b1U, 1U, 0U, 1U},
+        // x = 0x1.1b8p+1, exp10m1f16(x) = 0x1.45cp+7 (RZ)
+        {0x406eU, 0x5917U, 1U, 0U, 1U},
+        // x = 0x1.2f4p+2, exp10m1f16(x) = 0x1.ab8p+15 (RZ)
+        {0x44bdU, 0x7aaeU, 1U, 0U, 1U},
+#endif
+    }};
+
+LLVM_LIBC_FUNCTION(float16, exp10m1f16, (float16 x)) {
+  using FPBits = fputil::FPBits<float16>;
+  FPBits x_bits(x);
+
+  uint16_t x_u = x_bits.uintval();
+  uint16_t x_abs = x_u & 0x7fffU;
+
+  // When |x| <= 2^(-3), or |x| >= 11 * log10(2), or x is NaN.
+  if (LIBC_UNLIKELY(x_abs <= 0x3000U || x_abs >= 0x429fU)) {
+    // exp10m1(NaN) = NaN
+    if (x_bits.is_nan()) {
+      if (x_bits.is_signaling_nan()) {
+        fputil::raise_except_if_required(FE_INVALID);
+        return FPBits::quiet_nan().get_val();
+      }
+
+      return x;
+    }
+
+    // When x >= 16 * log10(2).
+    if (x_u >= 0x44d1U && x_bits.is_pos()) {
+      // exp10m1(+inf) = +inf
+      if (x_bits.is_inf())
+        return FPBits::inf().get_val();
+
+      switch (fputil::quick_get_round()) {
+      case FE_TONEAREST:
+      case FE_UPWARD:
+        fputil::set_errno_if_required(ERANGE);
+        fputil::raise_except_if_required(FE_OVERFLOW | FE_INEXACT);
+        return FPBits::inf().get_val();
+      default:
+        return FPBits::max_normal().get_val();
+      }
+    }
+
+    // When x < -11 * log10(2).
+    if (x_u > 0xc29fU) {
+      // exp10m1(-inf) = -1
+      if (x_bits.is_inf())
+        return FPBits::one(Sign::NEG).get_val();
+
+      // When x >= -0x1.ce4p+1, round(10^x - 1, HP, RN) = -0x1.ffcp-1.
+      if (x_u <= 0xc339U) {
+        return fputil::round_result_slightly_down(
+            fputil::cast<float16>(-0x1.ffcp-1));
+      }
+
+      // When x < -0x1.ce4p+1, round(10^x - 1, HP, RN) = -1.
+      switch (fputil::quick_get_round()) {
+      case FE_TONEAREST:
+      case FE_DOWNWARD:
+        return FPBits::one(Sign::NEG).get_val();
+      default:
+        return fputil::cast<float16>(-0x1.ffcp-1);
+      }
+    }
+
+    // When |x| <= 2^(-3).
+    if (x_abs <= 0x3000U) {
+      if (auto r = EXP10M1F16_EXCEPTS_LO.lookup(x_u);
+          LIBC_UNLIKELY(r.has_value()))
+        return r.value();
+
+      float xf = x;
+      // Degree-5 minimax polynomial generated by Sollya with the following
+      // commands:
+      //   > display = hexadecimal;
+      //   > P = fpminimax((10^x - 1)/x, 4, [|SG...|], [-2^-3, 2^-3]);
+      //   > x * P;
+      return fputil::cast<float16>(
+          xf * fputil::polyeval(xf, 0x1.26bb1cp+1f, 0x1.5351c8p+1f,
+                                0x1.04704p+1f, 0x1.2ce084p+0f, 0x1.14a6bep-1f));
+    }
+  }
+
+  // When x is 1, 2, or 3. These are hard-to-round cases with exact results.
+  // 10^4 - 1 = 9'999 is not exactly representable as a float16, but luckily the
+  // polynomial approximation gives the correct result for x = 4 in all
+  // rounding modes.
+  if (LIBC_UNLIKELY((x_u & ~(0x3c00U | 0x4000U | 0x4200U | 0x4400U)) == 0)) {
+    switch (x_u) {
+    case 0x3c00U: // x = 1.0f16
+      return fputil::cast<float16>(9.0);
+    case 0x4000U: // x = 2.0f16
+      return fputil::cast<float16>(99.0);
+    case 0x4200U: // x = 3.0f16
+      return fputil::cast<float16>(999.0);
+    }
+  }
+
+  if (auto r = EXP10M1F16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+    return r.value();
+
+  // exp10(x) = exp2(hi + mid) * exp10(lo)
+  auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x);
+  // exp10m1(x) = exp2(hi + mid) * exp10(lo) - 1
+  return fputil::cast<float16>(
+      fputil::multiply_add(exp2_hi_mid, exp10_lo, -1.0f));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/expxf16.h b/libc/src/math/generic/expxf16.h
index 35294130a15007..8de329bd2ab07f 100644
--- a/libc/src/math/generic/expxf16.h
+++ b/libc/src/math/generic/expxf16.h
@@ -127,6 +127,53 @@ LIBC_INLINE ExpRangeReduction exp2_range_reduction(float16 x) {
   return {exp2_hi_mid, exp2_lo};
 }
 
+// Generated by Sollya with the following commands:
+//   > display = hexadecimal;
+//   > round(log2(10), SG, RN);
+static constexpr float LOG2F_10 = 0x1.a934fp+1f;
+
+// Generated by Sollya with the following commands:
+//   > display = hexadecimal;
+//   > round(log10(2), SG, RN);
+static constexpr float LOG10F_2 = 0x1.344136p-2f;
+
+LIBC_INLINE ExpRangeReduction exp10_range_reduction(float16 x) {
+  // For -8 < x < 5, to compute 10^x, we perform the following range reduction:
+  // find hi, mid, lo, such that:
+  //   x = (hi + mid) * log10(2) + lo, in which
+  //     hi is an integer,
+  //     mid * 2^3 is an integer,
+  //     -2^(-4) <= lo < 2^(-4).
+  // In particular,
+  //   hi + mid = round(x * 2^3 * log2(10)) * 2^(-3).
+  // Then,
+  //   10^x = 10^((hi + mid) * log10(2)) * 10^lo = 2^hi * 2^mid * 10^lo
+  // We store 2^mid in the lookup table EXP2_MID_BITS, and compute 2^hi * 2^mid
+  // by adding hi to the exponent field of 2^mid.  10^lo is computed using a
+  // degree-4 minimax polynomial generated by Sollya.
+
+  float xf = x;
+  float kf = fputil::nearest_integer(xf * (LOG2F_10 * 0x1.0p+3f));
+  int x_hi_mid = static_cast<int>(kf);
+  int x_hi = x_hi_mid >> 3;
+  int x_mid = x_hi_mid & 0x7;
+  // lo = x - (hi + mid) = round(x * 2^3 * log2(10)) * log10(2) * (-2^(-3)) + x
+  float lo = fputil::multiply_add(kf, LOG10F_2 * -0x1.0p-3f, xf);
+
+  uint32_t exp2_hi_mid_bits =
+      EXP2_MID_BITS[x_mid] +
+      static_cast<uint32_t>(x_hi << fputil::FPBits<float>::FRACTION_LEN);
+  float exp2_hi_mid = fputil::FPBits<float>(exp2_hi_mid_bits).get_val();
+  // Degree-4 minimax polynomial generated by Sollya with the following
+  // commands:
+  //   > display = hexadecimal;
+  //   > P = fpminimax((10^x - 1)/x, 3, [|SG...|], [-2^-4, 2^-4]);
+  //   > 1 + x * P;
+  float exp10_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.26bb14p+1f, 0x1.53526p+1f,
+                                    0x1.04b434p+1f, 0x1.2bcf9ep+0f);
+  return {exp2_hi_mid, exp10_lo};
+}
+
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_SRC_MATH_GENERIC_EXPXF16_H
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 12e1d078b29b32..5dff0b49125b96 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -1062,6 +1062,17 @@ add_fp_unittest(
     libc.src.math.exp10f16
 )
 
+add_fp_unittest(
+  exp10m1f16_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    exp10m1f16_test.cpp
+  DEPENDS
+    libc.src.math.exp10m1f16
+)
+
 add_fp_unittest(
   copysign_test
   SUITE
diff --git a/libc/test/src/math/exp10m1f16_test.cpp b/libc/test/src/math/exp10m1f16_test.cpp
new file mode 100644
index 00000000000000..41bb12f7d0973a
--- /dev/null
+++ b/libc/test/src/math/exp10m1f16_test.cpp
@@ -0,0 +1,40 @@
+//===-- Exhaustive test for exp10m1f16 ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/exp10m1f16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcExp10m1f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+// Range: [0, Inf];
+static constexpr uint16_t POS_START = 0x0000U;
+static constexpr uint16_t POS_STOP = 0x7c00U;
+
+// Range: [-Inf, 0];
+static constexpr uint16_t NEG_START = 0x8000U;
+static constexpr uint16_t NEG_STOP = 0xfc00U;
+
+TEST_F(LlvmLibcExp10m1f16Test, PositiveRange) {
+  for (uint16_t v = POS_START; v <= POS_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x,
+                                   LIBC_NAMESPACE::exp10m1f16(x), 0.5);
+  }
+}
+
+TEST_F(LlvmLibcExp10m1f16Test, NegativeRange) {
+  for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x,
+                                   LIBC_NAMESPACE::exp10m1f16(x), 0.5);
+  }
+}
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 447ea695271397..6b3623dc0d0dbf 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -1235,6 +1235,19 @@ add_fp_unittest(
     libc.src.__support.FPUtil.cast
 )
 
+add_fp_unittest(
+  exp10m1f16_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    exp10m1f16_test.cpp
+  DEPENDS
+    libc.hdr.fenv_macros
+    libc.src.errno.errno
+    libc.src.math.exp10m1f16
+    libc.src.__support.FPUtil.cast
+)
+
 add_fp_unittest(
   copysign_test
   SUITE
diff --git a/libc/test/src/math/smoke/exp10m1f16_test.cpp b/libc/test/src/math/smoke/exp10m1f16_test.cpp
new file mode 100644
index 00000000000000..dfa7fa477d3d11
--- /dev/null
+++ b/libc/test/src/math/smoke/exp10m1f16_test.cpp
@@ -0,0 +1,113 @@
+//===-- Unittests for exp10m1f16 ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/fenv_macros.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/exp10m1f16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcExp10m1f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+TEST_F(LlvmLibcExp10m1f16Test, SpecialNumbers) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp10m1f16(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp10m1f16(sNaN),
+                              FE_INVALID);
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::exp10m1f16(inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
+                            LIBC_NAMESPACE::exp10m1f16(neg_inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::exp10m1f16(zero));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, LIBC_NAMESPACE::exp10m1f16(neg_zero));
+  EXPECT_MATH_ERRNO(0);
+}
+
+TEST_F(LlvmLibcExp10m1f16Test, Overflow) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp10m1f16(max_normal),
+                              FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  // round(16 * log10(2), HP, RN);
+  float16 x = LIBC_NAMESPACE::fputil::cast<float16>(0x1.344p+2);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(
+      inf, LIBC_NAMESPACE::exp10m1f16(x), FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(
+      inf, LIBC_NAMESPACE::exp10m1f16(x), FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD(
+      max_normal, LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO(
+      max_normal, LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+  EXPECT_MATH_ERRNO(0);
+}
+
+TEST_F(LlvmLibcExp10m1f16Test, ResultNearNegOne) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
+                              LIBC_NAMESPACE::exp10m1f16(neg_max_normal),
+                              FE_INEXACT);
+
+  // round(-11 * log10(2), HP, RD);
+  float16 x = LIBC_NAMESPACE::fputil::cast<float16>(-0x1.a8p+1);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(
+      LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ffcp-1),
+      LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(
+      LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ffcp-1),
+      LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD(
+      LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
+      LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO(
+      LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ffcp-1),
+      LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+  // Next float16 value below -0x1.ce4p+1.
+  x = LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ce8p+1);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(
+      LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
+      LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(
+      LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ffcp-1),
+      LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD(
+      LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
+      LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO(
+      LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ffcp-1),
+      LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+}
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index eecffc782c1a74..bd4fbe294a622d 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -334,6 +334,29 @@ class MPFRNumber {
     return result;
   }
 
+  MPFRNumber exp10m1() const {
+    // TODO: Only use mpfr_exp10m1 once CI and buildbots get MPFR >= 4.2.0.
+#if MPFR_VERSION_MAJOR > 4 ||                                                  \
+    (MPFR_VERSION_MAJOR == 4 && MPFR_VERSION_MINOR >= 2)
+    MPFRNumber result(*this);
+    mpfr_exp10m1(result.value, value, mpfr_rounding);
+    return result;
+#else
+    unsigned int prec = mpfr_precision * 3;
+    MPFRNumber result(*this, prec);
+
+    MPFRNumber ln10(10.0f, prec);
+    // log(10)
+    mpfr_log(ln10.value, ln10.value, mpfr_rounding);
+    // x * log(10)
+    mpfr_mul(result.value, value, ln10.value, mpfr_rounding);
+    // e^(x * log(10)) - 1
+    int ex = mpfr_expm1(result.value, result.value, mpfr_rounding);
+    mpfr_subnormalize(result.value, ex, mpfr_rounding);
+    return result;
+#endif
+  }
+
   MPFRNumber expm1() const {
     MPFRNumber result(*this);
     mpfr_expm1(result.value, value, mpfr_rounding);
@@ -744,6 +767,8 @@ unary_operation(Operation op, InputType input, unsigned int precision,
     return mpfrInput.exp2m1();
   case Operation::Exp10:
     return mpfrInput.exp10();
+  case Operation::Exp10m1:
+    return mpfrInput.exp10m1();
   case Operation::Expm1:
     return mpfrInput.expm1();
   case Operation::Floor:
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h
index 8d51fa4e477267..9fc12a6adefb56 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.h
+++ b/libc/utils/MPFRWrapper/MPFRUtils.h
@@ -42,6 +42,7 @@ enum class Operation : int {
   Exp2,
   Exp2m1,
   Exp10,
+  Exp10m1,
   Expm1,
   Floor,
   Log,

>From 9381c6fd04cc16a7606633f57c96c11e58181ddb Mon Sep 17 00:00:00 2001
From: Krystian Stasiowski <sdkrystian at gmail.com>
Date: Wed, 16 Oct 2024 08:40:03 -0600
Subject: [PATCH 78/82] [Clang][Sema] Use the correct injected template
 arguments for partial specializations when collecting multi-level template
 argument lists (#112381)

After #111852 refactored multi-level template argument list collection,
the following results in a crash:
```
template<typename T, bool B>
struct A;

template<bool B>
struct A<int, B>
{
    void f() requires B;
};

template<bool B>
void A<int, B>::f() requires B { } // crash here
```

This happens because when collecting template arguments for constraint
normalization from a partial specialization, we incorrectly use the
template argument list of the partial specialization. We should be using
the template argument list of the _template-head_ (as defined in
[temp.arg.general] p2). Fixes #112222.
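
To make the distinction concrete, here is an informal annotation of the example
above (illustration only, not part of the patch):
```
template<typename T, bool B>
struct A;

template<bool B>        // template-head: a single parameter B
struct A<int, B>        // written template argument list: <int, B>
{
    void f() requires B;
};

// When normalizing the constraint `B` for the out-of-line definition of f(),
// the outer level of template arguments must be the "injected" arguments of
// the template-head, i.e. just <B>, rather than the partial specialization's
// written argument list <int, B>.
```
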
---
 clang/include/clang/AST/DeclTemplate.h        |  18 +-
 clang/lib/AST/DeclTemplate.cpp                |  28 ++
 clang/lib/Sema/SemaTemplateInstantiate.cpp    |   4 +-
 .../temp/temp.constr/temp.constr.decl/p4.cpp  | 284 ++++++++++--------
 4 files changed, 211 insertions(+), 123 deletions(-)

diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h
index 141f58c4600af0..0f0c0bf6e4ef4f 100644
--- a/clang/include/clang/AST/DeclTemplate.h
+++ b/clang/include/clang/AST/DeclTemplate.h
@@ -2085,7 +2085,11 @@ class ClassTemplateSpecializationDecl : public CXXRecordDecl,
 class ClassTemplatePartialSpecializationDecl
   : public ClassTemplateSpecializationDecl {
   /// The list of template parameters
-  TemplateParameterList* TemplateParams = nullptr;
+  TemplateParameterList *TemplateParams = nullptr;
+
+  /// The set of "injected" template arguments used within this
+  /// partial specialization.
+  TemplateArgument *InjectedArgs = nullptr;
 
   /// The class template partial specialization from which this
   /// class template partial specialization was instantiated.
@@ -2132,6 +2136,10 @@ class ClassTemplatePartialSpecializationDecl
     return TemplateParams;
   }
 
+  /// Retrieve the template arguments list of the template parameter list
+  /// of this template.
+  ArrayRef<TemplateArgument> getInjectedTemplateArgs();
+
   /// \brief All associated constraints of this partial specialization,
   /// including the requires clause and any constraints derived from
   /// constrained-parameters.
@@ -2856,6 +2864,10 @@ class VarTemplatePartialSpecializationDecl
   /// The list of template parameters
   TemplateParameterList *TemplateParams = nullptr;
 
+  /// The set of "injected" template arguments used within this
+  /// partial specialization.
+  TemplateArgument *InjectedArgs = nullptr;
+
   /// The variable template partial specialization from which this
   /// variable template partial specialization was instantiated.
   ///
@@ -2902,6 +2914,10 @@ class VarTemplatePartialSpecializationDecl
     return TemplateParams;
   }
 
+  /// Retrieve the template arguments list of the template parameter list
+  /// of this template.
+  ArrayRef<TemplateArgument> getInjectedTemplateArgs();
+
   /// \brief All associated constraints of this partial specialization,
   /// including the requires clause and any constraints derived from
   /// constrained-parameters.
diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp
index d9b67b7bedf5a5..d2d8907b884ec8 100644
--- a/clang/lib/AST/DeclTemplate.cpp
+++ b/clang/lib/AST/DeclTemplate.cpp
@@ -1185,6 +1185,20 @@ SourceRange ClassTemplatePartialSpecializationDecl::getSourceRange() const {
   return Range;
 }
 
+ArrayRef<TemplateArgument>
+ClassTemplatePartialSpecializationDecl::getInjectedTemplateArgs() {
+  TemplateParameterList *Params = getTemplateParameters();
+  auto *First = cast<ClassTemplatePartialSpecializationDecl>(getFirstDecl());
+  if (!First->InjectedArgs) {
+    auto &Context = getASTContext();
+    SmallVector<TemplateArgument, 16> TemplateArgs;
+    Context.getInjectedTemplateArgs(Params, TemplateArgs);
+    First->InjectedArgs = new (Context) TemplateArgument[TemplateArgs.size()];
+    std::copy(TemplateArgs.begin(), TemplateArgs.end(), First->InjectedArgs);
+  }
+  return llvm::ArrayRef(First->InjectedArgs, Params->size());
+}
+
 //===----------------------------------------------------------------------===//
 // FriendTemplateDecl Implementation
 //===----------------------------------------------------------------------===//
@@ -1535,6 +1549,20 @@ SourceRange VarTemplatePartialSpecializationDecl::getSourceRange() const {
   return Range;
 }
 
+ArrayRef<TemplateArgument>
+VarTemplatePartialSpecializationDecl::getInjectedTemplateArgs() {
+  TemplateParameterList *Params = getTemplateParameters();
+  auto *First = cast<VarTemplatePartialSpecializationDecl>(getFirstDecl());
+  if (!First->InjectedArgs) {
+    auto &Context = getASTContext();
+    SmallVector<TemplateArgument, 16> TemplateArgs;
+    Context.getInjectedTemplateArgs(Params, TemplateArgs);
+    First->InjectedArgs = new (Context) TemplateArgument[TemplateArgs.size()];
+    std::copy(TemplateArgs.begin(), TemplateArgs.end(), First->InjectedArgs);
+  }
+  return llvm::ArrayRef(First->InjectedArgs, Params->size());
+}
+
 static TemplateParameterList *
 createMakeIntegerSeqParameterList(const ASTContext &C, DeclContext *DC) {
   // typename T
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 8c7f694c09042e..8665c099903dc3 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -237,7 +237,7 @@ struct TemplateInstantiationArgumentCollecter
     if (Innermost)
       AddInnermostTemplateArguments(VTPSD);
     else if (ForConstraintInstantiation)
-      AddOuterTemplateArguments(VTPSD, VTPSD->getTemplateArgs().asArray(),
+      AddOuterTemplateArguments(VTPSD, VTPSD->getInjectedTemplateArgs(),
                                 /*Final=*/false);
 
     if (VTPSD->isMemberSpecialization())
@@ -274,7 +274,7 @@ struct TemplateInstantiationArgumentCollecter
     if (Innermost)
       AddInnermostTemplateArguments(CTPSD);
     else if (ForConstraintInstantiation)
-      AddOuterTemplateArguments(CTPSD, CTPSD->getTemplateArgs().asArray(),
+      AddOuterTemplateArguments(CTPSD, CTPSD->getInjectedTemplateArgs(),
                                 /*Final=*/false);
 
     if (CTPSD->isMemberSpecialization())
diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp
index 70064f867e18e3..f144e14cd122f9 100644
--- a/clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp
+++ b/clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp
@@ -1,175 +1,219 @@
 // RUN: %clang_cc1 -std=c++20 -verify %s
 // expected-no-diagnostics
 
-template<typename T>
-concept D = true;
+namespace Primary {
+  template<typename T>
+  concept D = true;
 
-template<typename T>
-struct A {
-  template<typename U, bool V>
-  void f() requires V;
+  template<typename T>
+  struct A {
+    template<typename U, bool V>
+    void f() requires V;
 
-  template<>
-  void f<short, true>();
+    template<>
+    void f<short, true>();
+
+    template<D U>
+    void g();
+
+    template<typename U, bool V> requires V
+    struct B;
+
+    template<typename U, bool V> requires V
+    struct B<U*, V>;
+
+    template<>
+    struct B<short, true>;
+
+    template<D U>
+    struct C;
+
+    template<D U>
+    struct C<U*>;
 
+    template<typename U, bool V> requires V
+    static int x;
+
+    template<typename U, bool V> requires V
+    static int x<U*, V>;
+
+    template<>
+    int x<short, true>;
+
+    template<D U>
+    static int y;
+
+    template<D U>
+    static int y<U*>;
+  };
+
+  template<typename T>
+  template<typename U, bool V>
+  void A<T>::f() requires V { }
+
+  template<typename T>
   template<D U>
-  void g();
+  void A<T>::g() { }
 
+  template<typename T>
   template<typename U, bool V> requires V
-  struct B;
+  struct A<T>::B { };
 
+  template<typename T>
   template<typename U, bool V> requires V
-  struct B<U*, V>;
+  struct A<T>::B<U*, V> { };
 
-  template<>
-  struct B<short, true>;
+  template<typename T>
+  template<typename U, bool V> requires V
+  struct A<T>::B<U&, V> { };
 
+  template<typename T>
   template<D U>
-  struct C;
+  struct A<T>::C { };
 
+  template<typename T>
   template<D U>
-  struct C<U*>;
+  struct A<T>::C<U*> { };
 
+  template<typename T>
   template<typename U, bool V> requires V
-  static int x;
+  int A<T>::x = 0;
 
+  template<typename T>
   template<typename U, bool V> requires V
-  static int x<U*, V>;
+  int A<T>::x<U*, V> = 0;
 
-  template<>
-  int x<short, true>;
+  template<typename T>
+  template<typename U, bool V> requires V
+  int A<T>::x<U&, V> = 0;
 
+  template<typename T>
   template<D U>
-  static int y;
+  int A<T>::y = 0;
 
+  template<typename T>
   template<D U>
-  static int y<U*>;
-};
-
-template<typename T>
-template<typename U, bool V>
-void A<T>::f() requires V { }
+  int A<T>::y<U*> = 0;
 
-template<typename T>
-template<D U>
-void A<T>::g() { }
-
-template<typename T>
-template<typename U, bool V> requires V
-struct A<T>::B { };
+  template<>
+  template<typename U, bool V>
+  void A<short>::f() requires V;
 
-template<typename T>
-template<typename U, bool V> requires V
-struct A<T>::B<U*, V> { };
+  template<>
+  template<>
+  void A<short>::f<int, true>();
 
-template<typename T>
-template<typename U, bool V> requires V
-struct A<T>::B<U&, V> { };
+  template<>
+  template<>
+  void A<void>::f<int, true>();
 
-template<typename T>
-template<D U>
-struct A<T>::C { };
+  template<>
+  template<D U>
+  void A<short>::g();
 
-template<typename T>
-template<D U>
-struct A<T>::C<U*> { };
+  template<>
+  template<typename U, bool V> requires V
+  struct A<int>::B;
 
-template<typename T>
-template<typename U, bool V> requires V
-int A<T>::x = 0;
+  template<>
+  template<>
+  struct A<int>::B<int, true>;
 
-template<typename T>
-template<typename U, bool V> requires V
-int A<T>::x<U*, V> = 0;
+  template<>
+  template<>
+  struct A<void>::B<int, true>;
 
-template<typename T>
-template<typename U, bool V> requires V
-int A<T>::x<U&, V> = 0;
+  template<>
+  template<typename U, bool V> requires V
+  struct A<int>::B<U*, V>;
 
-template<typename T>
-template<D U>
-int A<T>::y = 0;
+  template<>
+  template<typename U, bool V> requires V
+  struct A<int>::B<U&, V>;
 
-template<typename T>
-template<D U>
-int A<T>::y<U*> = 0;
+  template<>
+  template<D U>
+  struct A<int>::C;
 
-template<>
-template<typename U, bool V>
-void A<short>::f() requires V;
+  template<>
+  template<D U>
+  struct A<int>::C<U*>;
 
-template<>
-template<>
-void A<short>::f<int, true>();
+  template<>
+  template<D U>
+  struct A<int>::C<U&>;
 
-template<>
-template<>
-void A<void>::f<int, true>();
+  template<>
+  template<typename U, bool V> requires V
+  int A<long>::x;
 
-template<>
-template<D U>
-void A<short>::g();
+  template<>
+  template<>
+  int A<long>::x<int, true>;
 
-template<>
-template<typename U, bool V> requires V
-struct A<int>::B;
+  template<>
+  template<>
+  int A<void>::x<int, true>;
 
-template<>
-template<>
-struct A<int>::B<int, true>;
+  template<>
+  template<typename U, bool V> requires V
+  int A<long>::x<U*, V>;
 
-template<>
-template<>
-struct A<void>::B<int, true>;
+  template<>
+  template<typename U, bool V> requires V
+  int A<long>::x<U&, V>;
 
-template<>
-template<typename U, bool V> requires V
-struct A<int>::B<U*, V>;
+  template<>
+  template<D U>
+  int A<long>::y;
 
-template<>
-template<typename U, bool V> requires V
-struct A<int>::B<U&, V>;
+  template<>
+  template<D U>
+  int A<long>::y<U*>;
 
-template<>
-template<D U>
-struct A<int>::C;
+  template<>
+  template<D U>
+  int A<long>::y<U&>;
+} // namespace Primary
 
-template<>
-template<D U>
-struct A<int>::C<U*>;
+namespace Partial {
+  template<typename T, bool B>
+  struct A;
 
-template<>
-template<D U>
-struct A<int>::C<U&>;
+  template<bool U>
+  struct A<int, U>
+  {
+      template<typename V> requires U
+      void f();
 
-template<>
-template<typename U, bool V> requires V
-int A<long>::x;
+      template<typename V> requires U
+      static const int x;
 
-template<>
-template<>
-int A<long>::x<int, true>;
+      template<typename V> requires U
+      struct B;
+  };
 
-template<>
-template<>
-int A<void>::x<int, true>;
+  template<bool U>
+  template<typename V> requires U
+  void A<int, U>::f() { }
 
-template<>
-template<typename U, bool V> requires V
-int A<long>::x<U*, V>;
+  template<bool U>
+  template<typename V> requires U
+  constexpr int A<int, U>::x = 0;
 
-template<>
-template<typename U, bool V> requires V
-int A<long>::x<U&, V>;
+  template<bool U>
+  template<typename V> requires U
+  struct A<int, U>::B { };
 
-template<>
-template<D U>
-int A<long>::y;
+  template<>
+  template<typename V> requires true
+  void A<int, true>::f() { }
 
-template<>
-template<D U>
-int A<long>::y<U*>;
+  template<>
+  template<typename V> requires true
+  constexpr int A<int, true>::x = 1;
 
-template<>
-template<D U>
-int A<long>::y<U&>;
+  template<>
+  template<typename V> requires true
+  struct A<int, true>::B { };
+} // namespace Partial

>From 383df16317eec3b29b93025e2a86ea024b3f59c7 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett at linaro.org>
Date: Wed, 16 Oct 2024 15:40:54 +0100
Subject: [PATCH 79/82] [llvm][llvm-lit] Add total time for each testsuite in
 JUnit XML output (#112230)

Currently we write out the total time taken to run all test suites:
```
<testsuites time="8.28">
```
And one for each test:
```
<testcase classname="lldb-shell.Breakpoint" name="breakpoint-command.test" time="2.38"/>
```
However, the schema expects a `time` attribute on each `testsuite` and each
`testcase`, but not on the top-level `testsuites` element:

https://github.com/windyroad/JUnit-Schema/blob/cfa434d4b8e102a8f55b8727b552a0063ee9044e/JUnit.xsd#L216

I'm leaving the `testsuites` time in, though, because no one has
complained so far, and someone out there probably has a script relying
on it by now. Most XML tools handle unknown attributes well anyway.

I'm adding a per-testsuite time to comply with the schema and to improve
compatibility with other JUnit tools.
```
<testsuite name="lldb-shell" ... time="12.34">
```

The testsuite time is the sum of the times of all tests in the suite.
This ignores some overhead in setting up the suite, which means that the
sum of the times of all individual suites may not equal the `testsuites`
time.

As we're usually focused on the execution time of particular tests, not
lit's bookkeeping, I think this is a reasonable choice.
---
 llvm/utils/lit/lit/reports.py         | 14 +++++++++++---
 llvm/utils/lit/tests/shtest-format.py |  2 +-
 llvm/utils/lit/tests/xunit-output.py  |  2 +-
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/llvm/utils/lit/lit/reports.py b/llvm/utils/lit/lit/reports.py
index 2ac44b0c0ce86e..d2d719b076bc70 100755
--- a/llvm/utils/lit/lit/reports.py
+++ b/llvm/utils/lit/lit/reports.py
@@ -105,12 +105,20 @@ def write_results(self, tests, elapsed):
             file.write("</testsuites>\n")
 
     def _write_testsuite(self, file, suite, tests):
-        skipped = sum(1 for t in tests if t.result.code in self.skipped_codes)
-        failures = sum(1 for t in tests if t.isFailure())
+        skipped = 0
+        failures = 0
+        time = 0.0
+
+        for t in tests:
+            if t.result.code in self.skipped_codes:
+                skipped += 1
+            if t.isFailure():
+                failures += 1
+            time += t.result.elapsed
 
         name = suite.config.name.replace(".", "-")
         file.write(
-            f'<testsuite name={quo(name)} tests="{len(tests)}" failures="{failures}" skipped="{skipped}">\n'
+            f'<testsuite name={quo(name)} tests="{len(tests)}" failures="{failures}" skipped="{skipped}" time="{time:.2f}">\n'
         )
         for test in tests:
             self._write_test(file, test, name)
diff --git a/llvm/utils/lit/tests/shtest-format.py b/llvm/utils/lit/tests/shtest-format.py
index 4a3d65b7bce4fd..3a1959549e5d01 100644
--- a/llvm/utils/lit/tests/shtest-format.py
+++ b/llvm/utils/lit/tests/shtest-format.py
@@ -107,7 +107,7 @@
 
 # XUNIT: <?xml version="1.0" encoding="UTF-8"?>
 # XUNIT-NEXT: <testsuites time="{{[0-9.]+}}">
-# XUNIT-NEXT: <testsuite name="shtest-format" tests="22" failures="8" skipped="3">
+# XUNIT-NEXT: <testsuite name="shtest-format" tests="22" failures="8" skipped="3" time="{{[0-9.]+}}">
 
 # XUNIT: <testcase classname="shtest-format.external_shell" name="fail.txt" time="{{[0-9]+\.[0-9]+}}">
 # XUNIT-NEXT: <failure{{[ ]*}}>
diff --git a/llvm/utils/lit/tests/xunit-output.py b/llvm/utils/lit/tests/xunit-output.py
index 67d99849fe36d9..392cded4653fe1 100644
--- a/llvm/utils/lit/tests/xunit-output.py
+++ b/llvm/utils/lit/tests/xunit-output.py
@@ -9,7 +9,7 @@
 
 # CHECK:      <?xml version="1.0" encoding="UTF-8"?>
 # CHECK-NEXT: <testsuites time="{{[0-9.]+}}">
-# CHECK-NEXT: <testsuite name="test-data" tests="5" failures="1" skipped="3">
+# CHECK-NEXT: <testsuite name="test-data" tests="5" failures="1" skipped="3" time="{{[0-9.]+}}">
 # CHECK-NEXT: <testcase classname="test-data.test-data" name="bad&name.ini" time="{{[0-1]\.[0-9]+}}">
 # CHECK-NEXT:   <failure><![CDATA[& < > ]]]]><![CDATA[> &"]]></failure>
 # CHECK-NEXT: </testcase>

>From d9c95efb6c102fc9e9c52a558d611bb7aa433dbb Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 16 Oct 2024 15:43:30 +0100
Subject: [PATCH 80/82] [LLVM] Make more use of IRBuilder::CreateIntrinsic.
 NFC. (#112546)

Convert almost every instance of:
  CreateCall(Intrinsic::getOrInsertDeclaration(...), ...)
to the equivalent CreateIntrinsic call.
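
For reference, the pattern looks like this (a sketch only, using the ctlz
upgrade from llvm/lib/IR/AutoUpgrade.cpp as the example; the wrapper function
and its parameters are illustrative scaffolding, not code from the tree):
```
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Illustrative wrapper; in the real upgrade code, Builder, Arg and M come
// from the surrounding function.
static Value *emitCtlzBothWays(IRBuilder<> &Builder, Value *Arg, Module *M) {
  // Before: fetch (or insert) the intrinsic declaration, then emit the call.
  Value *Old = Builder.CreateCall(
      Intrinsic::getOrInsertDeclaration(M, Intrinsic::ctlz, {Arg->getType()}),
      {Arg, Builder.getFalse()}, "ctlz");
  (void)Old;

  // After: CreateIntrinsic declares the intrinsic and emits the call in one
  // step (the FMF source and the value name are optional trailing arguments).
  return Builder.CreateIntrinsic(Intrinsic::ctlz, {Arg->getType()},
                                 {Arg, Builder.getFalse()},
                                 /*FMFSource=*/nullptr, "ctlz");
}
```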
---
 llvm/lib/CodeGen/SafeStack.cpp                |   3 +-
 llvm/lib/CodeGen/StackProtector.cpp           |   6 +-
 llvm/lib/IR/AutoUpgrade.cpp                   | 150 +++++++-----------
 .../Target/AArch64/AArch64ISelLowering.cpp    |   3 +-
 llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp   |   5 +-
 llvm/lib/Target/ARM/ARMISelLowering.cpp       |   3 +-
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |   7 +-
 llvm/lib/Target/X86/X86WinEHState.cpp         |  21 +--
 .../Instrumentation/AddressSanitizer.cpp      |   7 +-
 .../Instrumentation/HWAddressSanitizer.cpp    |  19 ++-
 llvm/lib/Transforms/Instrumentation/KCFI.cpp  |   3 +-
 .../Instrumentation/PGOInstrumentation.cpp    |  32 ++--
 .../Instrumentation/ThreadSanitizer.cpp       |   4 +-
 13 files changed, 101 insertions(+), 162 deletions(-)

diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp
index a50909af8bfcfb..ad2037a2c20b55 100644
--- a/llvm/lib/CodeGen/SafeStack.cpp
+++ b/llvm/lib/CodeGen/SafeStack.cpp
@@ -368,8 +368,7 @@ Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) {
 
   if (!StackGuardVar) {
     TL.insertSSPDeclarations(*M);
-    return IRB.CreateCall(
-        Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackguard));
+    return IRB.CreateIntrinsic(Intrinsic::stackguard, {}, {});
   }
 
   return IRB.CreateLoad(StackPtrTy, StackGuardVar, "StackGuard");
diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp
index a192161bbd9481..0ce305c4410d27 100644
--- a/llvm/lib/CodeGen/StackProtector.cpp
+++ b/llvm/lib/CodeGen/StackProtector.cpp
@@ -519,8 +519,7 @@ static Value *getStackGuard(const TargetLoweringBase *TLI, Module *M,
   if (SupportsSelectionDAGSP)
     *SupportsSelectionDAGSP = true;
   TLI->insertSSPDeclarations(*M);
-  return B.CreateCall(
-      Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackguard));
+  return B.CreateIntrinsic(Intrinsic::stackguard, {}, {});
 }
 
 /// Insert code into the entry block that stores the stack guard
@@ -541,8 +540,7 @@ static bool CreatePrologue(Function *F, Module *M, Instruction *CheckLoc,
   AI = B.CreateAlloca(PtrTy, nullptr, "StackGuardSlot");
 
   Value *GuardSlot = getStackGuard(TLI, M, B, &SupportsSelectionDAGSP);
-  B.CreateCall(Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackprotector),
-               {GuardSlot, AI});
+  B.CreateIntrinsic(Intrinsic::stackprotector, {}, {GuardSlot, AI});
   return SupportsSelectionDAGSP;
 }
 
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 32f66f77f19f24..519ff8d74c5af4 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1745,8 +1745,7 @@ static Value *upgradeX86VPERMT2Intrinsics(IRBuilder<> &Builder, CallBase &CI,
   if (!IndexForm)
     std::swap(Args[0], Args[1]);
 
-  Value *V = Builder.CreateCall(
-      Intrinsic::getOrInsertDeclaration(CI.getModule(), IID), Args);
+  Value *V = Builder.CreateIntrinsic(IID, {}, Args);
   Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty)
                              : Builder.CreateBitCast(CI.getArgOperand(1),
                                                      Ty);
@@ -2269,8 +2268,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder,
   SmallVector<Value *, 4> Args(CI.args());
   Args.pop_back();
   Args.pop_back();
-  Rep = Builder.CreateCall(
-      Intrinsic::getOrInsertDeclaration(CI.getModule(), IID), Args);
+  Rep = Builder.CreateIntrinsic(IID, {}, Args);
   unsigned NumArgs = CI.arg_size();
   Rep = emitX86Select(Builder, CI.getArgOperand(NumArgs - 1), Rep,
                       CI.getArgOperand(NumArgs - 2));
@@ -2325,25 +2323,21 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
   } else if (Name == "clz.ll") {
     // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64.
     Value *Arg = CI->getArgOperand(0);
-    Value *Ctlz = Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctlz,
-                                          {Arg->getType()}),
-        {Arg, Builder.getFalse()}, "ctlz");
+    Value *Ctlz = Builder.CreateIntrinsic(Intrinsic::ctlz, {Arg->getType()},
+                                          {Arg, Builder.getFalse()},
+                                          /*FMFSource=*/nullptr, "ctlz");
     Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc");
   } else if (Name == "popc.ll") {
     // llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an
     // i64.
     Value *Arg = CI->getArgOperand(0);
-    Value *Popc = Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctpop,
-                                          {Arg->getType()}),
-        Arg, "ctpop");
+    Value *Popc = Builder.CreateIntrinsic(Intrinsic::ctpop, {Arg->getType()},
+                                          Arg, /*FMFSource=*/nullptr, "ctpop");
     Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc");
   } else if (Name == "h2f") {
-    Rep = Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
-                                 F->getParent(), Intrinsic::convert_from_fp16,
-                                 {Builder.getFloatTy()}),
-                             CI->getArgOperand(0), "h2f");
+    Rep = Builder.CreateIntrinsic(Intrinsic::convert_from_fp16,
+                                  {Builder.getFloatTy()}, CI->getArgOperand(0),
+                                  /*FMFSource=*/nullptr, "h2f");
   } else if (Name.consume_front("bitcast.") &&
              (Name == "f2i" || Name == "i2f" || Name == "ll2d" ||
               Name == "d2ll")) {
@@ -2493,10 +2487,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
   } else if (Name.starts_with("avx.sqrt.p") ||
              Name.starts_with("sse2.sqrt.p") ||
              Name.starts_with("sse.sqrt.p")) {
-    Rep =
-        Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
-                               F->getParent(), Intrinsic::sqrt, CI->getType()),
-                           {CI->getArgOperand(0)});
+    Rep = Builder.CreateIntrinsic(Intrinsic::sqrt, CI->getType(),
+                                  {CI->getArgOperand(0)});
   } else if (Name.starts_with("avx512.mask.sqrt.p")) {
     if (CI->arg_size() == 4 &&
         (!isa<ConstantInt>(CI->getArgOperand(3)) ||
@@ -2505,13 +2497,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
                                           : Intrinsic::x86_avx512_sqrt_pd_512;
 
       Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(3)};
-      Rep = Builder.CreateCall(
-          Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args);
+      Rep = Builder.CreateIntrinsic(IID, {}, Args);
     } else {
-      Rep = Builder.CreateCall(
-          Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::sqrt,
-                                            CI->getType()),
-          {CI->getArgOperand(0)});
+      Rep = Builder.CreateIntrinsic(Intrinsic::sqrt, CI->getType(),
+                                    {CI->getArgOperand(0)});
     }
     Rep =
         emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1));
@@ -2635,9 +2624,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
       break;
     }
 
-    Rep = Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(F->getParent(), IID),
-        {CI->getOperand(0), CI->getArgOperand(1)});
+    Rep = Builder.CreateIntrinsic(IID, {},
+                                  {CI->getOperand(0), CI->getArgOperand(1)});
     Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2));
   } else if (Name.starts_with("avx512.mask.fpclass.p")) {
     Type *OpTy = CI->getArgOperand(0)->getType();
@@ -2659,9 +2647,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
     else
       llvm_unreachable("Unexpected intrinsic");
 
-    Rep = Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(F->getParent(), IID),
-        {CI->getOperand(0), CI->getArgOperand(1)});
+    Rep = Builder.CreateIntrinsic(IID, {},
+                                  {CI->getOperand(0), CI->getArgOperand(1)});
     Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2));
   } else if (Name.starts_with("avx512.cmp.p")) {
     SmallVector<Value *, 4> Args(CI->args());
@@ -2689,8 +2676,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
       std::swap(Mask, Args.back());
     Args.push_back(Mask);
 
-    Rep = Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Args);
+    Rep = Builder.CreateIntrinsic(IID, {}, Args);
   } else if (Name.starts_with("avx512.mask.cmp.")) {
     // Integer compare intrinsics.
     unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
@@ -3413,8 +3399,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
       else
         IID = Intrinsic::x86_avx512_add_pd_512;
 
-      Rep = Builder.CreateCall(
-          Intrinsic::getOrInsertDeclaration(F->getParent(), IID),
+      Rep = Builder.CreateIntrinsic(
+          IID, {},
           {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)});
     } else {
       Rep = Builder.CreateFAdd(CI->getArgOperand(0), CI->getArgOperand(1));
@@ -3429,8 +3415,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
       else
         IID = Intrinsic::x86_avx512_div_pd_512;
 
-      Rep = Builder.CreateCall(
-          Intrinsic::getOrInsertDeclaration(F->getParent(), IID),
+      Rep = Builder.CreateIntrinsic(
+          IID, {},
           {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)});
     } else {
       Rep = Builder.CreateFDiv(CI->getArgOperand(0), CI->getArgOperand(1));
@@ -3445,8 +3431,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
       else
         IID = Intrinsic::x86_avx512_mul_pd_512;
 
-      Rep = Builder.CreateCall(
-          Intrinsic::getOrInsertDeclaration(F->getParent(), IID),
+      Rep = Builder.CreateIntrinsic(
+          IID, {},
           {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)});
     } else {
       Rep = Builder.CreateFMul(CI->getArgOperand(0), CI->getArgOperand(1));
@@ -3461,8 +3447,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
       else
         IID = Intrinsic::x86_avx512_sub_pd_512;
 
-      Rep = Builder.CreateCall(
-          Intrinsic::getOrInsertDeclaration(F->getParent(), IID),
+      Rep = Builder.CreateIntrinsic(
+          IID, {},
           {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)});
     } else {
       Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1));
@@ -3479,16 +3465,15 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
         {Intrinsic::x86_avx512_min_ps_512, Intrinsic::x86_avx512_min_pd_512}};
     Intrinsic::ID IID = MinMaxTbl[IsMin][IsDouble];
 
-    Rep = Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(F->getParent(), IID),
+    Rep = Builder.CreateIntrinsic(
+        IID, {},
         {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)});
     Rep =
         emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2));
   } else if (Name.starts_with("avx512.mask.lzcnt.")) {
     Rep =
-        Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
-                               F->getParent(), Intrinsic::ctlz, CI->getType()),
-                           {CI->getArgOperand(0), Builder.getInt1(false)});
+        Builder.CreateIntrinsic(Intrinsic::ctlz, CI->getType(),
+                                {CI->getArgOperand(0), Builder.getInt1(false)});
     Rep =
         emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1));
   } else if (Name.starts_with("avx512.mask.psll")) {
@@ -3732,10 +3717,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
     if (NegAcc)
       Ops[2] = Builder.CreateFNeg(Ops[2]);
 
-    Rep = Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(CI->getModule(), Intrinsic::fma,
-                                          Ops[0]->getType()),
-        Ops);
+    Rep = Builder.CreateIntrinsic(Intrinsic::fma, Ops[0]->getType(), Ops);
 
     if (IsScalar)
       Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0);
@@ -3747,10 +3729,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
     Ops[1] = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
     Ops[2] = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
 
-    Rep = Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(CI->getModule(), Intrinsic::fma,
-                                          Ops[0]->getType()),
-        Ops);
+    Rep = Builder.CreateIntrinsic(Intrinsic::fma, Ops[0]->getType(), Ops);
 
     Rep = Builder.CreateInsertElement(Constant::getNullValue(CI->getType()),
                                       Rep, (uint64_t)0);
@@ -3846,9 +3825,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
       else
         IID = Intrinsic::x86_avx512_vfmadd_pd_512;
 
-      Rep = Builder.CreateCall(
-          Intrinsic::getOrInsertDeclaration(F->getParent(), IID),
-          {A, B, C, CI->getArgOperand(4)});
+      Rep = Builder.CreateIntrinsic(IID, {}, {A, B, C, CI->getArgOperand(4)});
     } else {
       Function *FMA = Intrinsic::getOrInsertDeclaration(
           CI->getModule(), Intrinsic::fma, A->getType());
@@ -3878,8 +3855,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
     Value *Ops[] = {CI->getArgOperand(0), CI->getArgOperand(1),
                     CI->getArgOperand(2)};
     Ops[2] = Builder.CreateFNeg(Ops[2]);
-    Rep = Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Ops);
+    Rep = Builder.CreateIntrinsic(IID, {}, Ops);
   } else if (Name.starts_with("avx512.mask.vfmaddsub.p") ||
              Name.starts_with("avx512.mask3.vfmaddsub.p") ||
              Name.starts_with("avx512.maskz.vfmaddsub.p") ||
@@ -3902,8 +3878,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
       if (IsSubAdd)
         Ops[2] = Builder.CreateFNeg(Ops[2]);
 
-      Rep = Builder.CreateCall(
-          Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Ops);
+      Rep = Builder.CreateIntrinsic(IID, {}, Ops);
     } else {
       int NumElts = cast<FixedVectorType>(CI->getType())->getNumElements();
 
@@ -3954,8 +3929,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
 
     Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
                      CI->getArgOperand(2), CI->getArgOperand(3)};
-    Rep = Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args);
+    Rep = Builder.CreateIntrinsic(IID, {}, Args);
     Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
                                : CI->getArgOperand(0);
     Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru);
@@ -3982,8 +3956,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
 
     Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
                      CI->getArgOperand(2)};
-    Rep = Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args);
+    Rep = Builder.CreateIntrinsic(IID, {}, Args);
     Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
                                : CI->getArgOperand(0);
     Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
@@ -4018,8 +3991,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
 
     Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
                      CI->getArgOperand(2)};
-    Rep = Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args);
+    Rep = Builder.CreateIntrinsic(IID, {}, Args);
     Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
                                : CI->getArgOperand(0);
     Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
@@ -4048,8 +4020,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
 
     Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
                      CI->getArgOperand(2)};
-    Rep = Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args);
+    Rep = Builder.CreateIntrinsic(IID, {}, Args);
     Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
                                : CI->getArgOperand(0);
     Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
@@ -4071,8 +4042,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
     // Make a call with 3 operands.
     Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
                      CI->getArgOperand(2)};
-    Value *NewCall = Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args);
+    Value *NewCall = Builder.CreateIntrinsic(IID, {}, Args);
 
     // Extract the second result and store it.
     Value *Data = Builder.CreateExtractValue(NewCall, 1);
@@ -4127,20 +4097,15 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,
   if (Name == "mve.vctp64.old") {
     // Replace the old v4i1 vctp64 with a v2i1 vctp and predicate-casts to the
     // correct type.
-    Value *VCTP =
-        Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
-                               F->getParent(), Intrinsic::arm_mve_vctp64),
-                           CI->getArgOperand(0), CI->getName());
-    Value *C1 = Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(
-            F->getParent(), Intrinsic::arm_mve_pred_v2i,
-            {VectorType::get(Builder.getInt1Ty(), 2, false)}),
-        VCTP);
-    return Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(
-            F->getParent(), Intrinsic::arm_mve_pred_i2v,
-            {VectorType::get(Builder.getInt1Ty(), 4, false)}),
-        C1);
+    Value *VCTP = Builder.CreateIntrinsic(Intrinsic::arm_mve_vctp64, {},
+                                          CI->getArgOperand(0),
+                                          /*FMFSource=*/nullptr, CI->getName());
+    Value *C1 = Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_pred_v2i,
+        {VectorType::get(Builder.getInt1Ty(), 2, false)}, VCTP);
+    return Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_pred_i2v,
+        {VectorType::get(Builder.getInt1Ty(), 4, false)}, C1);
   } else if (Name == "mve.mull.int.predicated.v2i64.v4i32.v4i1" ||
              Name == "mve.vqdmull.predicated.v2i64.v4i32.v4i1" ||
              Name == "mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1" ||
@@ -4198,15 +4163,10 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,
     for (Value *Op : CI->args()) {
       Type *Ty = Op->getType();
       if (Ty->getScalarSizeInBits() == 1) {
-        Value *C1 = Builder.CreateCall(
-            Intrinsic::getOrInsertDeclaration(
-                F->getParent(), Intrinsic::arm_mve_pred_v2i,
-                {VectorType::get(Builder.getInt1Ty(), 4, false)}),
-            Op);
-        Op = Builder.CreateCall(
-            Intrinsic::getOrInsertDeclaration(
-                F->getParent(), Intrinsic::arm_mve_pred_i2v, {V2I1Ty}),
-            C1);
+        Value *C1 = Builder.CreateIntrinsic(
+            Intrinsic::arm_mve_pred_v2i,
+            {VectorType::get(Builder.getInt1Ty(), 4, false)}, Op);
+        Op = Builder.CreateIntrinsic(Intrinsic::arm_mve_pred_i2v, {V2I1Ty}, C1);
       }
       Ops.push_back(Op);
     }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ed06d8a5d63013..6ec492227d9f19 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27284,8 +27284,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
     IRBuilderBase &Builder) const {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-  Builder.CreateCall(
-      Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_clrex));
+  Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {}, {});
 }
 
 Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index cfce56f0bfe968..51af16c48f7097 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -921,9 +921,8 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
   FunctionCallee AsanFreeFunc = M.getOrInsertFunction(
       StringRef("__asan_free_impl"),
       FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty}, false));
-  Value *ReturnAddr = IRB.CreateCall(
-      Intrinsic::getOrInsertDeclaration(&M, Intrinsic::returnaddress),
-      IRB.getInt32(0));
+  Value *ReturnAddr =
+      IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, IRB.getInt32(0));
   Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty);
   Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty);
   IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt});
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index a35582bebb08a3..75a4ccb7b35329 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21446,8 +21446,7 @@ void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
   if (!Subtarget->hasV7Ops())
     return;
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-  Builder.CreateCall(
-      Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_clrex));
+  Builder.CreateIntrinsic(Intrinsic::arm_clrex, {}, {});
 }
 
 Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 911d92f0c4846b..cec1e507f08f2f 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12205,11 +12205,8 @@ Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
     // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
     // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
     if (isa<LoadInst>(Inst))
-      return Builder.CreateCall(
-          Intrinsic::getOrInsertDeclaration(
-              Builder.GetInsertBlock()->getParent()->getParent(),
-              Intrinsic::ppc_cfence, {Inst->getType()}),
-          {Inst});
+      return Builder.CreateIntrinsic(Intrinsic::ppc_cfence, {Inst->getType()},
+                                     {Inst});
     // FIXME: Can use isync for rmw operation.
     return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
   }
diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp
index 05fc6f13129f24..bc9fd801f94b22 100644
--- a/llvm/lib/Target/X86/X86WinEHState.cpp
+++ b/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -333,12 +333,10 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
     // If using _except_handler4, the EHGuard contains: FramePtr xor Cookie.
     if (UseStackGuard) {
       Value *Val = Builder.CreateLoad(Int32Ty, Cookie);
-      Value *FrameAddr = Builder.CreateCall(
-          Intrinsic::getOrInsertDeclaration(
-              TheModule, Intrinsic::frameaddress,
-              Builder.getPtrTy(
-                  TheModule->getDataLayout().getAllocaAddrSpace())),
-          Builder.getInt32(0), "frameaddr");
+      Value *FrameAddr = Builder.CreateIntrinsic(
+          Intrinsic::frameaddress,
+          Builder.getPtrTy(TheModule->getDataLayout().getAllocaAddrSpace()),
+          Builder.getInt32(0), /*FMFSource=*/nullptr, "frameaddr");
       Value *FrameAddrI32 = Builder.CreatePtrToInt(FrameAddr, Int32Ty);
       FrameAddrI32 = Builder.CreateXor(FrameAddrI32, Val);
       Builder.CreateStore(FrameAddrI32, EHGuardNode);
@@ -369,8 +367,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
 }
 
 Value *WinEHStatePass::emitEHLSDA(IRBuilder<> &Builder, Function *F) {
-  return Builder.CreateCall(
-      Intrinsic::getOrInsertDeclaration(TheModule, Intrinsic::x86_seh_lsda), F);
+  return Builder.CreateIntrinsic(Intrinsic::x86_seh_lsda, {}, F);
 }
 
 /// Generate a thunk that puts the LSDA of ParentFunc in EAX and then calls
@@ -624,17 +621,13 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
   // that it can recover the original frame pointer.
   IRBuilder<> Builder(RegNode->getNextNode());
   Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getPtrTy());
-  Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
-                         TheModule, Intrinsic::x86_seh_ehregnode),
-                     {RegNodeI8});
+  Builder.CreateIntrinsic(Intrinsic::x86_seh_ehregnode, {}, {RegNodeI8});
 
   if (EHGuardNode) {
     IRBuilder<> Builder(EHGuardNode->getNextNode());
     Value *EHGuardNodeI8 =
         Builder.CreateBitCast(EHGuardNode, Builder.getPtrTy());
-    Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
-                           TheModule, Intrinsic::x86_seh_ehguard),
-                       {EHGuardNodeI8});
+    Builder.CreateIntrinsic(Intrinsic::x86_seh_ehguard, {}, {EHGuardNodeI8});
   }
 
   // Calculate state numbers.
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 02d9fab309d83b..9d4f05780cb6e7 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1866,10 +1866,9 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
   if (UseCalls && ClOptimizeCallbacks) {
     const ASanAccessInfo AccessInfo(IsWrite, CompileKernel, AccessSizeIndex);
     Module *M = IRB.GetInsertBlock()->getParent()->getParent();
-    IRB.CreateCall(
-        Intrinsic::getOrInsertDeclaration(M, Intrinsic::asan_check_memaccess),
-        {IRB.CreatePointerCast(Addr, PtrTy),
-         ConstantInt::get(Int32Ty, AccessInfo.Packed)});
+    IRB.CreateIntrinsic(Intrinsic::asan_check_memaccess, {},
+                        {IRB.CreatePointerCast(Addr, PtrTy),
+                         ConstantInt::get(Int32Ty, AccessInfo.Packed)});
     return;
   }
 
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 5ec4973ea03d8f..c55043a9b20f5c 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1041,19 +1041,18 @@ void HWAddressSanitizer::instrumentMemAccessOutline(Value *Ptr, bool IsWrite,
   }
 
   if (UseFixedShadowIntrinsic) {
-    IRB.CreateCall(
-        Intrinsic::getOrInsertDeclaration(
-            M, UseShortGranules
-                   ? Intrinsic::hwasan_check_memaccess_shortgranules_fixedshadow
-                   : Intrinsic::hwasan_check_memaccess_fixedshadow),
+    IRB.CreateIntrinsic(
+        UseShortGranules
+            ? Intrinsic::hwasan_check_memaccess_shortgranules_fixedshadow
+            : Intrinsic::hwasan_check_memaccess_fixedshadow,
+        {},
         {Ptr, ConstantInt::get(Int32Ty, AccessInfo),
          ConstantInt::get(Int64Ty, Mapping.offset())});
   } else {
-    IRB.CreateCall(Intrinsic::getOrInsertDeclaration(
-                       M, UseShortGranules
-                              ? Intrinsic::hwasan_check_memaccess_shortgranules
-                              : Intrinsic::hwasan_check_memaccess),
-                   {ShadowBase, Ptr, ConstantInt::get(Int32Ty, AccessInfo)});
+    IRB.CreateIntrinsic(
+        UseShortGranules ? Intrinsic::hwasan_check_memaccess_shortgranules
+                         : Intrinsic::hwasan_check_memaccess,
+        {}, {ShadowBase, Ptr, ConstantInt::get(Int32Ty, AccessInfo)});
   }
 }
 
diff --git a/llvm/lib/Transforms/Instrumentation/KCFI.cpp b/llvm/lib/Transforms/Instrumentation/KCFI.cpp
index bbe0f4c6178192..4b653a83a896e7 100644
--- a/llvm/lib/Transforms/Instrumentation/KCFI.cpp
+++ b/llvm/lib/Transforms/Instrumentation/KCFI.cpp
@@ -110,8 +110,7 @@ PreservedAnalyses KCFIPass::run(Function &F, FunctionAnalysisManager &AM) {
     Instruction *ThenTerm =
         SplitBlockAndInsertIfThen(Test, Call, false, VeryUnlikelyWeights);
     Builder.SetInsertPoint(ThenTerm);
-    Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(&M, Intrinsic::debugtrap));
+    Builder.CreateIntrinsic(Intrinsic::debugtrap, {}, {});
     ++NumKCFIChecks;
   }
 
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index e6e474ed376069..919660e7a040f2 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -918,8 +918,8 @@ void FunctionInstrumenter::instrument() {
     IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt());
     // llvm.instrprof.cover(i8* <name>, i64 <hash>, i32 <num-counters>,
     //                      i32 <index>)
-    Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(&M, Intrinsic::instrprof_cover),
+    Builder.CreateIntrinsic(
+        Intrinsic::instrprof_cover, {},
         {NormalizedNamePtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)});
     return;
   }
@@ -971,10 +971,10 @@ void FunctionInstrumenter::instrument() {
     IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt());
     // llvm.instrprof.timestamp(i8* <name>, i64 <hash>, i32 <num-counters>,
     //                          i32 <index>)
-    Builder.CreateCall(
-        Intrinsic::getOrInsertDeclaration(&M, Intrinsic::instrprof_timestamp),
-        {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters),
-         Builder.getInt32(I)});
+    Builder.CreateIntrinsic(Intrinsic::instrprof_timestamp, {},
+                            {NormalizedNamePtr, CFGHash,
+                             Builder.getInt32(NumCounters),
+                             Builder.getInt32(I)});
     I += PGOBlockCoverage ? 8 : 1;
   }
 
@@ -984,12 +984,12 @@ void FunctionInstrumenter::instrument() {
            "Cannot get the Instrumentation point");
     // llvm.instrprof.increment(i8* <name>, i64 <hash>, i32 <num-counters>,
     //                          i32 <index>)
-    Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
-                           &M, PGOBlockCoverage
-                                   ? Intrinsic::instrprof_cover
-                                   : Intrinsic::instrprof_increment),
-                       {NormalizedNamePtr, CFGHash,
-                        Builder.getInt32(NumCounters), Builder.getInt32(I++)});
+    Builder.CreateIntrinsic(PGOBlockCoverage ? Intrinsic::instrprof_cover
+                                             : Intrinsic::instrprof_increment,
+                            {},
+                            {NormalizedNamePtr, CFGHash,
+                             Builder.getInt32(NumCounters),
+                             Builder.getInt32(I++)});
   }
 
   // Now instrument select instructions:
@@ -1726,10 +1726,10 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
   auto *NormalizedFuncNameVarPtr =
       ConstantExpr::getPointerBitCastOrAddrSpaceCast(
           FuncNameVar, PointerType::get(M->getContext(), 0));
-  Builder.CreateCall(
-      Intrinsic::getOrInsertDeclaration(M, Intrinsic::instrprof_increment_step),
-      {NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash),
-       Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step});
+  Builder.CreateIntrinsic(Intrinsic::instrprof_increment_step, {},
+                          {NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash),
+                           Builder.getInt32(TotalNumCtrs),
+                           Builder.getInt32(*CurCtrIdx), Step});
   ++(*CurCtrIdx);
 }
 
diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 388addfab181a4..915dc70336ded9 100644
--- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -572,9 +572,7 @@ bool ThreadSanitizer::sanitizeFunction(Function &F,
   if ((Res || HasCalls) && ClInstrumentFuncEntryExit) {
     InstrumentationIRBuilder IRB(F.getEntryBlock().getFirstNonPHI());
     Value *ReturnAddress =
-        IRB.CreateCall(Intrinsic::getOrInsertDeclaration(
-                           F.getParent(), Intrinsic::returnaddress),
-                       IRB.getInt32(0));
+        IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, IRB.getInt32(0));
     IRB.CreateCall(TsanFuncEntry, ReturnAddress);
 
     EscapeEnumerator EE(F, "tsan_cleanup", ClHandleCxxExceptions);

>From 9255850e89b1e538e11fcc8b71cfd0b320546a75 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 16 Oct 2024 16:10:45 +0100
Subject: [PATCH 81/82] [LLVM] Remove unused variables after #112546

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp            | 1 -
 llvm/lib/Target/ARM/ARMISelLowering.cpp                    | 1 -
 llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp   | 1 -
 llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp | 1 -
 4 files changed, 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6ec492227d9f19..60150c3328aaa7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27283,7 +27283,6 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
 
 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
     IRBuilderBase &Builder) const {
-  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {}, {});
 }
 
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 75a4ccb7b35329..a49dda871dc3ac 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21445,7 +21445,6 @@ void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
     IRBuilderBase &Builder) const {
   if (!Subtarget->hasV7Ops())
     return;
-  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Builder.CreateIntrinsic(Intrinsic::arm_clrex, {}, {});
 }
 
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 9d4f05780cb6e7..55e9903876b1d1 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1865,7 +1865,6 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
 
   if (UseCalls && ClOptimizeCallbacks) {
     const ASanAccessInfo AccessInfo(IsWrite, CompileKernel, AccessSizeIndex);
-    Module *M = IRB.GetInsertBlock()->getParent()->getParent();
     IRB.CreateIntrinsic(Intrinsic::asan_check_memaccess, {},
                         {IRB.CreatePointerCast(Addr, PtrTy),
                          ConstantInt::get(Int32Ty, AccessInfo.Packed)});
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index c55043a9b20f5c..21d4d37d7e6cdb 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1025,7 +1025,6 @@ void HWAddressSanitizer::instrumentMemAccessOutline(Value *Ptr, bool IsWrite,
         insertShadowTagCheck(Ptr, InsertBefore, DTU, LI).TagMismatchTerm;
 
   IRBuilder<> IRB(InsertBefore);
-  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
   bool UseFixedShadowIntrinsic = false;
   // The memaccess fixed shadow intrinsic is only supported on AArch64,
   // which allows a 16-bit immediate to be left-shifted by 32.

>From c4ba15184af6228b0d742b586f21ad6a571498e0 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 15 Oct 2024 22:02:18 +0400
Subject: [PATCH 82/82] clang/HIP: Remove REQUIRES libgcc from a test

---
 clang/test/Driver/hip-include-path.hip | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/test/Driver/hip-include-path.hip b/clang/test/Driver/hip-include-path.hip
index 1b4179e65c0b97..5eeee2f5ce0d5b 100644
--- a/clang/test/Driver/hip-include-path.hip
+++ b/clang/test/Driver/hip-include-path.hip
@@ -1,4 +1,3 @@
-// REQUIRES: libgcc
 // UNSUPPORTED: system-windows
 
 // RUN: %clang -c -### --target=x86_64-unknown-linux-gnu --cuda-gpu-arch=gfx900 \


