[llvm] [AMDGPU] Remove wavefrontsize feature from GFX10+ (PR #98400)

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 11 00:58:51 PDT 2024


https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/98400

From ac6a4836fd133969cc61a8e2af92cce6dfbb5c1b Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 10 Jul 2024 14:44:15 -0700
Subject: [PATCH 1/2] [AMDGPU] Remove wavefrontsize feature from GFX10+

A processor definition should not include a default feature that may be
switched off by a different wave size. With the default derived from the
generation instead, tests can use -mattr=+wavefrontsize64 rather than
-mattr=-wavefrontsize32,+wavefrontsize64.
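
For context, a minimal standalone sketch of the default wave-size selection
this patch introduces in the subtarget, assembler parser, and disassembler:
if neither wavefrontsize feature is explicitly set, GFX10+ defaults to
wave32 and earlier targets default to wave64. The names below are
simplified placeholders, not the actual LLVM API.

```cpp
#include <cassert>
#include <iostream>

enum class WaveFeature { None, Wave32, Wave64 };

// Hypothetical helper mirroring the fallback added to
// GCNSubtarget::initializeSubtargetDependencies and friends.
static WaveFeature resolveWaveSize(WaveFeature Requested, unsigned IsaMajor) {
  if (Requested != WaveFeature::None)
    return Requested;                           // explicit -mattr wins
  return IsaMajor >= 10 ? WaveFeature::Wave32   // GFX10+ default
                        : WaveFeature::Wave64;  // pre-GFX10 default
}

int main() {
  // gfx1100 with no explicit wave-size attribute -> wave32.
  assert(resolveWaveSize(WaveFeature::None, 11) == WaveFeature::Wave32);
  // gfx1010 with -mattr=+wavefrontsize64 -> wave64.
  assert(resolveWaveSize(WaveFeature::Wave64, 10) == WaveFeature::Wave64);
  // gfx900 (pre-GFX10) with no attribute -> wave64.
  assert(resolveWaveSize(WaveFeature::None, 9) == WaveFeature::Wave64);
  std::cout << "default wave-size selection behaves as described\n";
  return 0;
}
```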
---
 llvm/lib/Target/AMDGPU/AMDGPU.td              |   3 -
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |   8 +
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      |  11 +-
 .../Disassembler/AMDGPUDisassembler.cpp       |  20 +-
 .../AMDGPU/check-subtarget-features.ll        |   2 -
 .../AMDGPU/llvm.amdgcn.wavefrontsize.ll       |  16 +-
 llvm/test/CodeGen/AMDGPU/unknown-processor.ll |   2 +-
 .../MC/AMDGPU/gfx11_asm_vopc_t16_promote.s    | 654 +++++++++---------
 llvm/test/MC/AMDGPU/wave32.s                  |   8 +-
 .../MC/Disassembler/AMDGPU/gfx10-wave32.txt   |   4 +-
 10 files changed, 378 insertions(+), 350 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 94e8e77b3c052..dfc8eaea66f7b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1464,7 +1464,6 @@ def FeatureISAVersion10_Common : FeatureSet<
    FeatureLDSBankCount32,
    FeatureDLInsts,
    FeatureNSAEncoding,
-   FeatureWavefrontSize32,
    FeatureBackOffBarrier]>;
 
 def FeatureISAVersion10_1_Common : FeatureSet<
@@ -1548,7 +1547,6 @@ def FeatureISAVersion11_Common : FeatureSet<
    FeatureDot10Insts,
    FeatureNSAEncoding,
    FeaturePartialNSAEncoding,
-   FeatureWavefrontSize32,
    FeatureShaderCyclesRegister,
    FeatureArchitectedFlatScratch,
    FeatureAtomicFaddRtnInsts,
@@ -1625,7 +1623,6 @@ def FeatureISAVersion12 : FeatureSet<
    FeatureDot11Insts,
    FeatureNSAEncoding,
    FeaturePartialNSAEncoding,
-   FeatureWavefrontSize32,
    FeatureShaderCyclesHiLoRegisters,
    FeatureArchitectedFlatScratch,
    FeatureArchitectedSGPRs,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 21fe1bc31a27e..a59893d3cf85d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -105,6 +105,14 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                         : AMDGPUSubtarget::SOUTHERN_ISLANDS;
   }
 
+  if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
+      !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
+    if (getGeneration() >= AMDGPUSubtarget::GFX10)
+      ToggleFeature(AMDGPU::FeatureWavefrontSize32);
+    else
+      ToggleFeature(AMDGPU::FeatureWavefrontSize64);
+  }
+
   // We don't support FP64 for EG/NI atm.
   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
 
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index b08957d22ee74..1c3925cfad464 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1408,9 +1408,18 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
       copySTI().ToggleFeature("southern-islands");
     }
 
+    AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
+    FeatureBitset FB = getFeatureBits();
+    if (!FB[AMDGPU::FeatureWavefrontSize64] &&
+        !FB[AMDGPU::FeatureWavefrontSize32]) {
+      if (ISA.Major >= 10)
+        copySTI().ToggleFeature(AMDGPU::FeatureWavefrontSize32);
+      else
+        copySTI().ToggleFeature(AMDGPU::FeatureWavefrontSize64);
+    }
+
     setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits()));
 
-    AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
     if (ISA.Major >= 6 && isHsaAbi(getSTI())) {
       createConstantSymbol(".amdgcn.gfx_generation_number", ISA.Major);
       createConstantSymbol(".amdgcn.gfx_generation_minor", ISA.Minor);
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 695b2f246a778..57d717dd9e634 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -45,10 +45,26 @@ using namespace llvm;
 
 using DecodeStatus = llvm::MCDisassembler::DecodeStatus;
 
+static const MCSubtargetInfo &addDefaultWaveSize(const MCSubtargetInfo &STI,
+                                                 MCContext &Ctx) {
+  if (!STI.hasFeature(AMDGPU::FeatureWavefrontSize64) &&
+      !STI.hasFeature(AMDGPU::FeatureWavefrontSize32)) {
+    MCSubtargetInfo &STICopy = Ctx.getSubtargetCopy(STI);
+    if (AMDGPU::isGFX10Plus(STI))
+      STICopy.ToggleFeature(AMDGPU::FeatureWavefrontSize32);
+    else
+      STICopy.ToggleFeature(AMDGPU::FeatureWavefrontSize64);
+    return STICopy;
+  }
+
+  return STI;
+}
+
 AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI,
                                        MCContext &Ctx, MCInstrInfo const *MCII)
-    : MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()),
-      MAI(*Ctx.getAsmInfo()), TargetMaxInstBytes(MAI.getMaxInstLength(&STI)),
+    : MCDisassembler(addDefaultWaveSize(STI, Ctx), Ctx), MCII(MCII),
+      MRI(*Ctx.getRegisterInfo()), MAI(*Ctx.getAsmInfo()),
+      TargetMaxInstBytes(MAI.getMaxInstLength(&STI)),
       CodeObjectVersion(AMDGPU::getDefaultAMDHSACodeObjectVersion()) {
   // ToDo: AMDGPUDisassembler supports only VI ISA.
   if (!STI.hasFeature(AMDGPU::FeatureGCN3Encoding) && !isGFX10Plus())
diff --git a/llvm/test/CodeGen/AMDGPU/check-subtarget-features.ll b/llvm/test/CodeGen/AMDGPU/check-subtarget-features.ll
index c246939811046..95ae8a6adfdf8 100644
--- a/llvm/test/CodeGen/AMDGPU/check-subtarget-features.ll
+++ b/llvm/test/CodeGen/AMDGPU/check-subtarget-features.ll
@@ -1,5 +1,3 @@
-; RUN: not llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,-wavefrontsize64 < %s 2>&1 | FileCheck %s -check-prefix=ERR -implicit-check-not=error:
-; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,-wavefrontsize64 < %s 2>&1 | FileCheck %s -check-prefix=ERR -implicit-check-not=error:
 ; RUN: not llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+wavefrontsize64 < %s 2>&1 | FileCheck %s -check-prefix=ERR -implicit-check-not=error:
 ; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+wavefrontsize64 < %s 2>&1 | FileCheck %s -check-prefix=ERR -implicit-check-not=error:
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index 270ab5fee1125..824d3708c027d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 
 ; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
 ; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
@@ -10,10 +10,10 @@
 ; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
 ; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
 ; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
 
 ; GCN-LABEL: {{^}}fold_wavefrontsize:
 ; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
diff --git a/llvm/test/CodeGen/AMDGPU/unknown-processor.ll b/llvm/test/CodeGen/AMDGPU/unknown-processor.ll
index 683ba98e52cf1..9cfba8b2e5c04 100644
--- a/llvm/test/CodeGen/AMDGPU/unknown-processor.ll
+++ b/llvm/test/CodeGen/AMDGPU/unknown-processor.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mtriple=amdgcn-- -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s
 ; RUN: llc -mtriple=r600-- -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s
 target datalayout = "A5"
 
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopc_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopc_t16_promote.s
index b16caed8b275f..75f20b0c7f0c4 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vopc_t16_promote.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopc_t16_promote.s
@@ -12,13 +12,13 @@ v_cmp_class_f16 vcc, vcc_hi, v255
 v_cmp_class_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_class_f16_e64
 
-v_cmp_class_f16 vcc_lo, v127, v255
+v_cmp_class_f16 vcc, v127, v255
 // GFX11: v_cmp_class_f16_e64
 
-v_cmp_class_f16 vcc_lo, vcc_hi, v255
+v_cmp_class_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_class_f16_e64
 
-v_cmp_class_f16 vcc_lo, vcc_lo, v255
+v_cmp_class_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_class_f16_e64
 
 v_cmp_eq_f16 vcc, v1, v255
@@ -33,16 +33,16 @@ v_cmp_eq_f16 vcc, vcc_hi, v255
 v_cmp_eq_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_eq_f16_e64
 
-v_cmp_eq_f16 vcc_lo, v1, v255
+v_cmp_eq_f16 vcc, v1, v255
 // GFX11: v_cmp_eq_f16_e64
 
-v_cmp_eq_f16 vcc_lo, v127, v255
+v_cmp_eq_f16 vcc, v127, v255
 // GFX11: v_cmp_eq_f16_e64
 
-v_cmp_eq_f16 vcc_lo, vcc_hi, v255
+v_cmp_eq_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_eq_f16_e64
 
-v_cmp_eq_f16 vcc_lo, vcc_lo, v255
+v_cmp_eq_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_eq_f16_e64
 
 v_cmp_eq_i16 vcc, v1, v255
@@ -57,16 +57,16 @@ v_cmp_eq_i16 vcc, vcc_hi, v255
 v_cmp_eq_i16 vcc, vcc_lo, v255
 // GFX11: v_cmp_eq_i16_e64
 
-v_cmp_eq_i16 vcc_lo, v1, v255
+v_cmp_eq_i16 vcc, v1, v255
 // GFX11: v_cmp_eq_i16_e64
 
-v_cmp_eq_i16 vcc_lo, v127, v255
+v_cmp_eq_i16 vcc, v127, v255
 // GFX11: v_cmp_eq_i16_e64
 
-v_cmp_eq_i16 vcc_lo, vcc_hi, v255
+v_cmp_eq_i16 vcc, vcc_hi, v255
 // GFX11: v_cmp_eq_i16_e64
 
-v_cmp_eq_i16 vcc_lo, vcc_lo, v255
+v_cmp_eq_i16 vcc, vcc_lo, v255
 // GFX11: v_cmp_eq_i16_e64
 
 v_cmp_eq_u16 vcc, v1, v255
@@ -81,16 +81,16 @@ v_cmp_eq_u16 vcc, vcc_hi, v255
 v_cmp_eq_u16 vcc, vcc_lo, v255
 // GFX11: v_cmp_eq_u16_e64
 
-v_cmp_eq_u16 vcc_lo, v1, v255
+v_cmp_eq_u16 vcc, v1, v255
 // GFX11: v_cmp_eq_u16_e64
 
-v_cmp_eq_u16 vcc_lo, v127, v255
+v_cmp_eq_u16 vcc, v127, v255
 // GFX11: v_cmp_eq_u16_e64
 
-v_cmp_eq_u16 vcc_lo, vcc_hi, v255
+v_cmp_eq_u16 vcc, vcc_hi, v255
 // GFX11: v_cmp_eq_u16_e64
 
-v_cmp_eq_u16 vcc_lo, vcc_lo, v255
+v_cmp_eq_u16 vcc, vcc_lo, v255
 // GFX11: v_cmp_eq_u16_e64
 
 v_cmp_f_f16 vcc, v1, v255
@@ -105,16 +105,16 @@ v_cmp_f_f16 vcc, vcc_hi, v255
 v_cmp_f_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_f_f16_e64
 
-v_cmp_f_f16 vcc_lo, v1, v255
+v_cmp_f_f16 vcc, v1, v255
 // GFX11: v_cmp_f_f16_e64
 
-v_cmp_f_f16 vcc_lo, v127, v255
+v_cmp_f_f16 vcc, v127, v255
 // GFX11: v_cmp_f_f16_e64
 
-v_cmp_f_f16 vcc_lo, vcc_hi, v255
+v_cmp_f_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_f_f16_e64
 
-v_cmp_f_f16 vcc_lo, vcc_lo, v255
+v_cmp_f_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_f_f16_e64
 
 v_cmp_ge_f16 vcc, v1, v255
@@ -129,16 +129,16 @@ v_cmp_ge_f16 vcc, vcc_hi, v255
 v_cmp_ge_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_ge_f16_e64
 
-v_cmp_ge_f16 vcc_lo, v1, v255
+v_cmp_ge_f16 vcc, v1, v255
 // GFX11: v_cmp_ge_f16_e64
 
-v_cmp_ge_f16 vcc_lo, v127, v255
+v_cmp_ge_f16 vcc, v127, v255
 // GFX11: v_cmp_ge_f16_e64
 
-v_cmp_ge_f16 vcc_lo, vcc_hi, v255
+v_cmp_ge_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_ge_f16_e64
 
-v_cmp_ge_f16 vcc_lo, vcc_lo, v255
+v_cmp_ge_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_ge_f16_e64
 
 v_cmp_ge_i16 vcc, v1, v255
@@ -153,16 +153,16 @@ v_cmp_ge_i16 vcc, vcc_hi, v255
 v_cmp_ge_i16 vcc, vcc_lo, v255
 // GFX11: v_cmp_ge_i16_e64
 
-v_cmp_ge_i16 vcc_lo, v1, v255
+v_cmp_ge_i16 vcc, v1, v255
 // GFX11: v_cmp_ge_i16_e64
 
-v_cmp_ge_i16 vcc_lo, v127, v255
+v_cmp_ge_i16 vcc, v127, v255
 // GFX11: v_cmp_ge_i16_e64
 
-v_cmp_ge_i16 vcc_lo, vcc_hi, v255
+v_cmp_ge_i16 vcc, vcc_hi, v255
 // GFX11: v_cmp_ge_i16_e64
 
-v_cmp_ge_i16 vcc_lo, vcc_lo, v255
+v_cmp_ge_i16 vcc, vcc_lo, v255
 // GFX11: v_cmp_ge_i16_e64
 
 v_cmp_ge_u16 vcc, v1, v255
@@ -177,16 +177,16 @@ v_cmp_ge_u16 vcc, vcc_hi, v255
 v_cmp_ge_u16 vcc, vcc_lo, v255
 // GFX11: v_cmp_ge_u16_e64
 
-v_cmp_ge_u16 vcc_lo, v1, v255
+v_cmp_ge_u16 vcc, v1, v255
 // GFX11: v_cmp_ge_u16_e64
 
-v_cmp_ge_u16 vcc_lo, v127, v255
+v_cmp_ge_u16 vcc, v127, v255
 // GFX11: v_cmp_ge_u16_e64
 
-v_cmp_ge_u16 vcc_lo, vcc_hi, v255
+v_cmp_ge_u16 vcc, vcc_hi, v255
 // GFX11: v_cmp_ge_u16_e64
 
-v_cmp_ge_u16 vcc_lo, vcc_lo, v255
+v_cmp_ge_u16 vcc, vcc_lo, v255
 // GFX11: v_cmp_ge_u16_e64
 
 v_cmp_gt_f16 vcc, v1, v255
@@ -201,16 +201,16 @@ v_cmp_gt_f16 vcc, vcc_hi, v255
 v_cmp_gt_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_gt_f16_e64
 
-v_cmp_gt_f16 vcc_lo, v1, v255
+v_cmp_gt_f16 vcc, v1, v255
 // GFX11: v_cmp_gt_f16_e64
 
-v_cmp_gt_f16 vcc_lo, v127, v255
+v_cmp_gt_f16 vcc, v127, v255
 // GFX11: v_cmp_gt_f16_e64
 
-v_cmp_gt_f16 vcc_lo, vcc_hi, v255
+v_cmp_gt_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_gt_f16_e64
 
-v_cmp_gt_f16 vcc_lo, vcc_lo, v255
+v_cmp_gt_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_gt_f16_e64
 
 v_cmp_gt_i16 vcc, v1, v255
@@ -225,16 +225,16 @@ v_cmp_gt_i16 vcc, vcc_hi, v255
 v_cmp_gt_i16 vcc, vcc_lo, v255
 // GFX11: v_cmp_gt_i16_e64
 
-v_cmp_gt_i16 vcc_lo, v1, v255
+v_cmp_gt_i16 vcc, v1, v255
 // GFX11: v_cmp_gt_i16_e64
 
-v_cmp_gt_i16 vcc_lo, v127, v255
+v_cmp_gt_i16 vcc, v127, v255
 // GFX11: v_cmp_gt_i16_e64
 
-v_cmp_gt_i16 vcc_lo, vcc_hi, v255
+v_cmp_gt_i16 vcc, vcc_hi, v255
 // GFX11: v_cmp_gt_i16_e64
 
-v_cmp_gt_i16 vcc_lo, vcc_lo, v255
+v_cmp_gt_i16 vcc, vcc_lo, v255
 // GFX11: v_cmp_gt_i16_e64
 
 v_cmp_gt_u16 vcc, v1, v255
@@ -249,16 +249,16 @@ v_cmp_gt_u16 vcc, vcc_hi, v255
 v_cmp_gt_u16 vcc, vcc_lo, v255
 // GFX11: v_cmp_gt_u16_e64
 
-v_cmp_gt_u16 vcc_lo, v1, v255
+v_cmp_gt_u16 vcc, v1, v255
 // GFX11: v_cmp_gt_u16_e64
 
-v_cmp_gt_u16 vcc_lo, v127, v255
+v_cmp_gt_u16 vcc, v127, v255
 // GFX11: v_cmp_gt_u16_e64
 
-v_cmp_gt_u16 vcc_lo, vcc_hi, v255
+v_cmp_gt_u16 vcc, vcc_hi, v255
 // GFX11: v_cmp_gt_u16_e64
 
-v_cmp_gt_u16 vcc_lo, vcc_lo, v255
+v_cmp_gt_u16 vcc, vcc_lo, v255
 // GFX11: v_cmp_gt_u16_e64
 
 v_cmp_le_f16 vcc, v1, v255
@@ -273,16 +273,16 @@ v_cmp_le_f16 vcc, vcc_hi, v255
 v_cmp_le_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_le_f16_e64
 
-v_cmp_le_f16 vcc_lo, v1, v255
+v_cmp_le_f16 vcc, v1, v255
 // GFX11: v_cmp_le_f16_e64
 
-v_cmp_le_f16 vcc_lo, v127, v255
+v_cmp_le_f16 vcc, v127, v255
 // GFX11: v_cmp_le_f16_e64
 
-v_cmp_le_f16 vcc_lo, vcc_hi, v255
+v_cmp_le_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_le_f16_e64
 
-v_cmp_le_f16 vcc_lo, vcc_lo, v255
+v_cmp_le_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_le_f16_e64
 
 v_cmp_le_i16 vcc, v1, v255
@@ -297,16 +297,16 @@ v_cmp_le_i16 vcc, vcc_hi, v255
 v_cmp_le_i16 vcc, vcc_lo, v255
 // GFX11: v_cmp_le_i16_e64
 
-v_cmp_le_i16 vcc_lo, v1, v255
+v_cmp_le_i16 vcc, v1, v255
 // GFX11: v_cmp_le_i16_e64
 
-v_cmp_le_i16 vcc_lo, v127, v255
+v_cmp_le_i16 vcc, v127, v255
 // GFX11: v_cmp_le_i16_e64
 
-v_cmp_le_i16 vcc_lo, vcc_hi, v255
+v_cmp_le_i16 vcc, vcc_hi, v255
 // GFX11: v_cmp_le_i16_e64
 
-v_cmp_le_i16 vcc_lo, vcc_lo, v255
+v_cmp_le_i16 vcc, vcc_lo, v255
 // GFX11: v_cmp_le_i16_e64
 
 v_cmp_le_u16 vcc, v1, v255
@@ -321,16 +321,16 @@ v_cmp_le_u16 vcc, vcc_hi, v255
 v_cmp_le_u16 vcc, vcc_lo, v255
 // GFX11: v_cmp_le_u16_e64
 
-v_cmp_le_u16 vcc_lo, v1, v255
+v_cmp_le_u16 vcc, v1, v255
 // GFX11: v_cmp_le_u16_e64
 
-v_cmp_le_u16 vcc_lo, v127, v255
+v_cmp_le_u16 vcc, v127, v255
 // GFX11: v_cmp_le_u16_e64
 
-v_cmp_le_u16 vcc_lo, vcc_hi, v255
+v_cmp_le_u16 vcc, vcc_hi, v255
 // GFX11: v_cmp_le_u16_e64
 
-v_cmp_le_u16 vcc_lo, vcc_lo, v255
+v_cmp_le_u16 vcc, vcc_lo, v255
 // GFX11: v_cmp_le_u16_e64
 
 v_cmp_lg_f16 vcc, v1, v255
@@ -345,16 +345,16 @@ v_cmp_lg_f16 vcc, vcc_hi, v255
 v_cmp_lg_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_lg_f16_e64
 
-v_cmp_lg_f16 vcc_lo, v1, v255
+v_cmp_lg_f16 vcc, v1, v255
 // GFX11: v_cmp_lg_f16_e64
 
-v_cmp_lg_f16 vcc_lo, v127, v255
+v_cmp_lg_f16 vcc, v127, v255
 // GFX11: v_cmp_lg_f16_e64
 
-v_cmp_lg_f16 vcc_lo, vcc_hi, v255
+v_cmp_lg_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_lg_f16_e64
 
-v_cmp_lg_f16 vcc_lo, vcc_lo, v255
+v_cmp_lg_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_lg_f16_e64
 
 v_cmp_lt_f16 vcc, v1, v255
@@ -369,16 +369,16 @@ v_cmp_lt_f16 vcc, vcc_hi, v255
 v_cmp_lt_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_lt_f16_e64
 
-v_cmp_lt_f16 vcc_lo, v1, v255
+v_cmp_lt_f16 vcc, v1, v255
 // GFX11: v_cmp_lt_f16_e64
 
-v_cmp_lt_f16 vcc_lo, v127, v255
+v_cmp_lt_f16 vcc, v127, v255
 // GFX11: v_cmp_lt_f16_e64
 
-v_cmp_lt_f16 vcc_lo, vcc_hi, v255
+v_cmp_lt_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_lt_f16_e64
 
-v_cmp_lt_f16 vcc_lo, vcc_lo, v255
+v_cmp_lt_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_lt_f16_e64
 
 v_cmp_lt_i16 vcc, v1, v255
@@ -393,16 +393,16 @@ v_cmp_lt_i16 vcc, vcc_hi, v255
 v_cmp_lt_i16 vcc, vcc_lo, v255
 // GFX11: v_cmp_lt_i16_e64
 
-v_cmp_lt_i16 vcc_lo, v1, v255
+v_cmp_lt_i16 vcc, v1, v255
 // GFX11: v_cmp_lt_i16_e64
 
-v_cmp_lt_i16 vcc_lo, v127, v255
+v_cmp_lt_i16 vcc, v127, v255
 // GFX11: v_cmp_lt_i16_e64
 
-v_cmp_lt_i16 vcc_lo, vcc_hi, v255
+v_cmp_lt_i16 vcc, vcc_hi, v255
 // GFX11: v_cmp_lt_i16_e64
 
-v_cmp_lt_i16 vcc_lo, vcc_lo, v255
+v_cmp_lt_i16 vcc, vcc_lo, v255
 // GFX11: v_cmp_lt_i16_e64
 
 v_cmp_lt_u16 vcc, v1, v255
@@ -417,16 +417,16 @@ v_cmp_lt_u16 vcc, vcc_hi, v255
 v_cmp_lt_u16 vcc, vcc_lo, v255
 // GFX11: v_cmp_lt_u16_e64
 
-v_cmp_lt_u16 vcc_lo, v1, v255
+v_cmp_lt_u16 vcc, v1, v255
 // GFX11: v_cmp_lt_u16_e64
 
-v_cmp_lt_u16 vcc_lo, v127, v255
+v_cmp_lt_u16 vcc, v127, v255
 // GFX11: v_cmp_lt_u16_e64
 
-v_cmp_lt_u16 vcc_lo, vcc_hi, v255
+v_cmp_lt_u16 vcc, vcc_hi, v255
 // GFX11: v_cmp_lt_u16_e64
 
-v_cmp_lt_u16 vcc_lo, vcc_lo, v255
+v_cmp_lt_u16 vcc, vcc_lo, v255
 // GFX11: v_cmp_lt_u16_e64
 
 v_cmp_ne_i16 vcc, v1, v255
@@ -441,16 +441,16 @@ v_cmp_ne_i16 vcc, vcc_hi, v255
 v_cmp_ne_i16 vcc, vcc_lo, v255
 // GFX11: v_cmp_ne_i16_e64
 
-v_cmp_ne_i16 vcc_lo, v1, v255
+v_cmp_ne_i16 vcc, v1, v255
 // GFX11: v_cmp_ne_i16_e64
 
-v_cmp_ne_i16 vcc_lo, v127, v255
+v_cmp_ne_i16 vcc, v127, v255
 // GFX11: v_cmp_ne_i16_e64
 
-v_cmp_ne_i16 vcc_lo, vcc_hi, v255
+v_cmp_ne_i16 vcc, vcc_hi, v255
 // GFX11: v_cmp_ne_i16_e64
 
-v_cmp_ne_i16 vcc_lo, vcc_lo, v255
+v_cmp_ne_i16 vcc, vcc_lo, v255
 // GFX11: v_cmp_ne_i16_e64
 
 v_cmp_ne_u16 vcc, v1, v255
@@ -465,16 +465,16 @@ v_cmp_ne_u16 vcc, vcc_hi, v255
 v_cmp_ne_u16 vcc, vcc_lo, v255
 // GFX11: v_cmp_ne_u16_e64
 
-v_cmp_ne_u16 vcc_lo, v1, v255
+v_cmp_ne_u16 vcc, v1, v255
 // GFX11: v_cmp_ne_u16_e64
 
-v_cmp_ne_u16 vcc_lo, v127, v255
+v_cmp_ne_u16 vcc, v127, v255
 // GFX11: v_cmp_ne_u16_e64
 
-v_cmp_ne_u16 vcc_lo, vcc_hi, v255
+v_cmp_ne_u16 vcc, vcc_hi, v255
 // GFX11: v_cmp_ne_u16_e64
 
-v_cmp_ne_u16 vcc_lo, vcc_lo, v255
+v_cmp_ne_u16 vcc, vcc_lo, v255
 // GFX11: v_cmp_ne_u16_e64
 
 v_cmp_neq_f16 vcc, v1, v255
@@ -489,16 +489,16 @@ v_cmp_neq_f16 vcc, vcc_hi, v255
 v_cmp_neq_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_neq_f16_e64
 
-v_cmp_neq_f16 vcc_lo, v1, v255
+v_cmp_neq_f16 vcc, v1, v255
 // GFX11: v_cmp_neq_f16_e64
 
-v_cmp_neq_f16 vcc_lo, v127, v255
+v_cmp_neq_f16 vcc, v127, v255
 // GFX11: v_cmp_neq_f16_e64
 
-v_cmp_neq_f16 vcc_lo, vcc_hi, v255
+v_cmp_neq_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_neq_f16_e64
 
-v_cmp_neq_f16 vcc_lo, vcc_lo, v255
+v_cmp_neq_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_neq_f16_e64
 
 v_cmp_nge_f16 vcc, v1, v255
@@ -513,16 +513,16 @@ v_cmp_nge_f16 vcc, vcc_hi, v255
 v_cmp_nge_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_nge_f16_e64
 
-v_cmp_nge_f16 vcc_lo, v1, v255
+v_cmp_nge_f16 vcc, v1, v255
 // GFX11: v_cmp_nge_f16_e64
 
-v_cmp_nge_f16 vcc_lo, v127, v255
+v_cmp_nge_f16 vcc, v127, v255
 // GFX11: v_cmp_nge_f16_e64
 
-v_cmp_nge_f16 vcc_lo, vcc_hi, v255
+v_cmp_nge_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_nge_f16_e64
 
-v_cmp_nge_f16 vcc_lo, vcc_lo, v255
+v_cmp_nge_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_nge_f16_e64
 
 v_cmp_ngt_f16 vcc, v1, v255
@@ -537,16 +537,16 @@ v_cmp_ngt_f16 vcc, vcc_hi, v255
 v_cmp_ngt_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_ngt_f16_e64
 
-v_cmp_ngt_f16 vcc_lo, v1, v255
+v_cmp_ngt_f16 vcc, v1, v255
 // GFX11: v_cmp_ngt_f16_e64
 
-v_cmp_ngt_f16 vcc_lo, v127, v255
+v_cmp_ngt_f16 vcc, v127, v255
 // GFX11: v_cmp_ngt_f16_e64
 
-v_cmp_ngt_f16 vcc_lo, vcc_hi, v255
+v_cmp_ngt_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_ngt_f16_e64
 
-v_cmp_ngt_f16 vcc_lo, vcc_lo, v255
+v_cmp_ngt_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_ngt_f16_e64
 
 v_cmp_nle_f16 vcc, v1, v255
@@ -561,16 +561,16 @@ v_cmp_nle_f16 vcc, vcc_hi, v255
 v_cmp_nle_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_nle_f16_e64
 
-v_cmp_nle_f16 vcc_lo, v1, v255
+v_cmp_nle_f16 vcc, v1, v255
 // GFX11: v_cmp_nle_f16_e64
 
-v_cmp_nle_f16 vcc_lo, v127, v255
+v_cmp_nle_f16 vcc, v127, v255
 // GFX11: v_cmp_nle_f16_e64
 
-v_cmp_nle_f16 vcc_lo, vcc_hi, v255
+v_cmp_nle_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_nle_f16_e64
 
-v_cmp_nle_f16 vcc_lo, vcc_lo, v255
+v_cmp_nle_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_nle_f16_e64
 
 v_cmp_nlg_f16 vcc, v1, v255
@@ -585,16 +585,16 @@ v_cmp_nlg_f16 vcc, vcc_hi, v255
 v_cmp_nlg_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_nlg_f16_e64
 
-v_cmp_nlg_f16 vcc_lo, v1, v255
+v_cmp_nlg_f16 vcc, v1, v255
 // GFX11: v_cmp_nlg_f16_e64
 
-v_cmp_nlg_f16 vcc_lo, v127, v255
+v_cmp_nlg_f16 vcc, v127, v255
 // GFX11: v_cmp_nlg_f16_e64
 
-v_cmp_nlg_f16 vcc_lo, vcc_hi, v255
+v_cmp_nlg_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_nlg_f16_e64
 
-v_cmp_nlg_f16 vcc_lo, vcc_lo, v255
+v_cmp_nlg_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_nlg_f16_e64
 
 v_cmp_nlt_f16 vcc, v1, v255
@@ -609,16 +609,16 @@ v_cmp_nlt_f16 vcc, vcc_hi, v255
 v_cmp_nlt_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_nlt_f16_e64
 
-v_cmp_nlt_f16 vcc_lo, v1, v255
+v_cmp_nlt_f16 vcc, v1, v255
 // GFX11: v_cmp_nlt_f16_e64
 
-v_cmp_nlt_f16 vcc_lo, v127, v255
+v_cmp_nlt_f16 vcc, v127, v255
 // GFX11: v_cmp_nlt_f16_e64
 
-v_cmp_nlt_f16 vcc_lo, vcc_hi, v255
+v_cmp_nlt_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_nlt_f16_e64
 
-v_cmp_nlt_f16 vcc_lo, vcc_lo, v255
+v_cmp_nlt_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_nlt_f16_e64
 
 v_cmp_o_f16 vcc, v1, v255
@@ -633,16 +633,16 @@ v_cmp_o_f16 vcc, vcc_hi, v255
 v_cmp_o_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_o_f16_e64
 
-v_cmp_o_f16 vcc_lo, v1, v255
+v_cmp_o_f16 vcc, v1, v255
 // GFX11: v_cmp_o_f16_e64
 
-v_cmp_o_f16 vcc_lo, v127, v255
+v_cmp_o_f16 vcc, v127, v255
 // GFX11: v_cmp_o_f16_e64
 
-v_cmp_o_f16 vcc_lo, vcc_hi, v255
+v_cmp_o_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_o_f16_e64
 
-v_cmp_o_f16 vcc_lo, vcc_lo, v255
+v_cmp_o_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_o_f16_e64
 
 v_cmp_t_f16 vcc, v1, v255
@@ -657,16 +657,16 @@ v_cmp_t_f16 vcc, vcc_hi, v255
 v_cmp_t_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_t_f16 vcc_lo, v1, v255
+v_cmp_t_f16 vcc, v1, v255
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_t_f16 vcc_lo, v127, v255
+v_cmp_t_f16 vcc, v127, v255
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_t_f16 vcc_lo, vcc_hi, v255
+v_cmp_t_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_t_f16 vcc_lo, vcc_lo, v255
+v_cmp_t_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_t_f16_e64
 
 v_cmp_tru_f16 vcc, v1, v255
@@ -681,16 +681,16 @@ v_cmp_tru_f16 vcc, vcc_hi, v255
 v_cmp_tru_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_tru_f16 vcc_lo, v1, v255
+v_cmp_tru_f16 vcc, v1, v255
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_tru_f16 vcc_lo, v127, v255
+v_cmp_tru_f16 vcc, v127, v255
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_tru_f16 vcc_lo, vcc_hi, v255
+v_cmp_tru_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_tru_f16 vcc_lo, vcc_lo, v255
+v_cmp_tru_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_t_f16_e64
 
 v_cmp_u_f16 vcc, v1, v255
@@ -705,196 +705,196 @@ v_cmp_u_f16 vcc, vcc_hi, v255
 v_cmp_u_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_u_f16_e64
 
-v_cmp_u_f16 vcc_lo, v1, v255
+v_cmp_u_f16 vcc, v1, v255
 // GFX11: v_cmp_u_f16_e64
 
-v_cmp_u_f16 vcc_lo, v127, v255
+v_cmp_u_f16 vcc, v127, v255
 // GFX11: v_cmp_u_f16_e64
 
-v_cmp_u_f16 vcc_lo, vcc_hi, v255
+v_cmp_u_f16 vcc, vcc_hi, v255
 // GFX11: v_cmp_u_f16_e64
 
-v_cmp_u_f16 vcc_lo, vcc_lo, v255
+v_cmp_u_f16 vcc, vcc_lo, v255
 // GFX11: v_cmp_u_f16_e64
 
 v_cmp_class_f16 vcc, v128, v2
 // GFX11: v_cmp_class_f16_e64
 
-v_cmp_class_f16 vcc_lo, v128, v2
+v_cmp_class_f16 vcc, v128, v2
 // GFX11: v_cmp_class_f16_e64
 
 v_cmp_eq_f16 vcc, v128, v2
 // GFX11: v_cmp_eq_f16_e64
 
-v_cmp_eq_f16 vcc_lo, v128, v2
+v_cmp_eq_f16 vcc, v128, v2
 // GFX11: v_cmp_eq_f16_e64
 
 v_cmp_eq_i16 vcc, v128, v2
 // GFX11: v_cmp_eq_i16_e64
 
-v_cmp_eq_i16 vcc_lo, v128, v2
+v_cmp_eq_i16 vcc, v128, v2
 // GFX11: v_cmp_eq_i16_e64
 
 v_cmp_eq_u16 vcc, v128, v2
 // GFX11: v_cmp_eq_u16_e64
 
-v_cmp_eq_u16 vcc_lo, v128, v2
+v_cmp_eq_u16 vcc, v128, v2
 // GFX11: v_cmp_eq_u16_e64
 
 v_cmp_f_f16 vcc, v128, v2
 // GFX11: v_cmp_f_f16_e64
 
-v_cmp_f_f16 vcc_lo, v128, v2
+v_cmp_f_f16 vcc, v128, v2
 // GFX11: v_cmp_f_f16_e64
 
 v_cmp_ge_f16 vcc, v128, v2
 // GFX11: v_cmp_ge_f16_e64
 
-v_cmp_ge_f16 vcc_lo, v128, v2
+v_cmp_ge_f16 vcc, v128, v2
 // GFX11: v_cmp_ge_f16_e64
 
 v_cmp_ge_i16 vcc, v128, v2
 // GFX11: v_cmp_ge_i16_e64
 
-v_cmp_ge_i16 vcc_lo, v128, v2
+v_cmp_ge_i16 vcc, v128, v2
 // GFX11: v_cmp_ge_i16_e64
 
 v_cmp_ge_u16 vcc, v128, v2
 // GFX11: v_cmp_ge_u16_e64
 
-v_cmp_ge_u16 vcc_lo, v128, v2
+v_cmp_ge_u16 vcc, v128, v2
 // GFX11: v_cmp_ge_u16_e64
 
 v_cmp_gt_f16 vcc, v128, v2
 // GFX11: v_cmp_gt_f16_e64
 
-v_cmp_gt_f16 vcc_lo, v128, v2
+v_cmp_gt_f16 vcc, v128, v2
 // GFX11: v_cmp_gt_f16_e64
 
 v_cmp_gt_i16 vcc, v128, v2
 // GFX11: v_cmp_gt_i16_e64
 
-v_cmp_gt_i16 vcc_lo, v128, v2
+v_cmp_gt_i16 vcc, v128, v2
 // GFX11: v_cmp_gt_i16_e64
 
 v_cmp_gt_u16 vcc, v128, v2
 // GFX11: v_cmp_gt_u16_e64
 
-v_cmp_gt_u16 vcc_lo, v128, v2
+v_cmp_gt_u16 vcc, v128, v2
 // GFX11: v_cmp_gt_u16_e64
 
 v_cmp_le_f16 vcc, v128, v2
 // GFX11: v_cmp_le_f16_e64
 
-v_cmp_le_f16 vcc_lo, v128, v2
+v_cmp_le_f16 vcc, v128, v2
 // GFX11: v_cmp_le_f16_e64
 
 v_cmp_le_i16 vcc, v128, v2
 // GFX11: v_cmp_le_i16_e64
 
-v_cmp_le_i16 vcc_lo, v128, v2
+v_cmp_le_i16 vcc, v128, v2
 // GFX11: v_cmp_le_i16_e64
 
 v_cmp_le_u16 vcc, v128, v2
 // GFX11: v_cmp_le_u16_e64
 
-v_cmp_le_u16 vcc_lo, v128, v2
+v_cmp_le_u16 vcc, v128, v2
 // GFX11: v_cmp_le_u16_e64
 
 v_cmp_lg_f16 vcc, v128, v2
 // GFX11: v_cmp_lg_f16_e64
 
-v_cmp_lg_f16 vcc_lo, v128, v2
+v_cmp_lg_f16 vcc, v128, v2
 // GFX11: v_cmp_lg_f16_e64
 
 v_cmp_lt_f16 vcc, v128, v2
 // GFX11: v_cmp_lt_f16_e64
 
-v_cmp_lt_f16 vcc_lo, v128, v2
+v_cmp_lt_f16 vcc, v128, v2
 // GFX11: v_cmp_lt_f16_e64
 
 v_cmp_lt_i16 vcc, v128, v2
 // GFX11: v_cmp_lt_i16_e64
 
-v_cmp_lt_i16 vcc_lo, v128, v2
+v_cmp_lt_i16 vcc, v128, v2
 // GFX11: v_cmp_lt_i16_e64
 
 v_cmp_lt_u16 vcc, v128, v2
 // GFX11: v_cmp_lt_u16_e64
 
-v_cmp_lt_u16 vcc_lo, v128, v2
+v_cmp_lt_u16 vcc, v128, v2
 // GFX11: v_cmp_lt_u16_e64
 
 v_cmp_ne_i16 vcc, v128, v2
 // GFX11: v_cmp_ne_i16_e64
 
-v_cmp_ne_i16 vcc_lo, v128, v2
+v_cmp_ne_i16 vcc, v128, v2
 // GFX11: v_cmp_ne_i16_e64
 
 v_cmp_ne_u16 vcc, v128, v2
 // GFX11: v_cmp_ne_u16_e64
 
-v_cmp_ne_u16 vcc_lo, v128, v2
+v_cmp_ne_u16 vcc, v128, v2
 // GFX11: v_cmp_ne_u16_e64
 
 v_cmp_neq_f16 vcc, v128, v2
 // GFX11: v_cmp_neq_f16_e64
 
-v_cmp_neq_f16 vcc_lo, v128, v2
+v_cmp_neq_f16 vcc, v128, v2
 // GFX11: v_cmp_neq_f16_e64
 
 v_cmp_nge_f16 vcc, v128, v2
 // GFX11: v_cmp_nge_f16_e64
 
-v_cmp_nge_f16 vcc_lo, v128, v2
+v_cmp_nge_f16 vcc, v128, v2
 // GFX11: v_cmp_nge_f16_e64
 
 v_cmp_ngt_f16 vcc, v128, v2
 // GFX11: v_cmp_ngt_f16_e64
 
-v_cmp_ngt_f16 vcc_lo, v128, v2
+v_cmp_ngt_f16 vcc, v128, v2
 // GFX11: v_cmp_ngt_f16_e64
 
 v_cmp_nle_f16 vcc, v128, v2
 // GFX11: v_cmp_nle_f16_e64
 
-v_cmp_nle_f16 vcc_lo, v128, v2
+v_cmp_nle_f16 vcc, v128, v2
 // GFX11: v_cmp_nle_f16_e64
 
 v_cmp_nlg_f16 vcc, v128, v2
 // GFX11: v_cmp_nlg_f16_e64
 
-v_cmp_nlg_f16 vcc_lo, v128, v2
+v_cmp_nlg_f16 vcc, v128, v2
 // GFX11: v_cmp_nlg_f16_e64
 
 v_cmp_nlt_f16 vcc, v128, v2
 // GFX11: v_cmp_nlt_f16_e64
 
-v_cmp_nlt_f16 vcc_lo, v128, v2
+v_cmp_nlt_f16 vcc, v128, v2
 // GFX11: v_cmp_nlt_f16_e64
 
 v_cmp_o_f16 vcc, v128, v2
 // GFX11: v_cmp_o_f16_e64
 
-v_cmp_o_f16 vcc_lo, v128, v2
+v_cmp_o_f16 vcc, v128, v2
 // GFX11: v_cmp_o_f16_e64
 
 v_cmp_t_f16 vcc, v128, v2
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_t_f16 vcc_lo, v128, v2
+v_cmp_t_f16 vcc, v128, v2
 // GFX11: v_cmp_t_f16_e64
 
 v_cmp_tru_f16 vcc, v128, v2
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_tru_f16 vcc_lo, v128, v2
+v_cmp_tru_f16 vcc, v128, v2
 // GFX11: v_cmp_t_f16_e64
 
 v_cmp_u_f16 vcc, v128, v2
 // GFX11: v_cmp_u_f16_e64
 
-v_cmp_u_f16 vcc_lo, v128, v2
+v_cmp_u_f16 vcc, v128, v2
 // GFX11: v_cmp_u_f16_e64
 
 v_cmp_class_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -903,7 +903,7 @@ v_cmp_class_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_class_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_class_f16_e64
 
-v_cmp_class_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_class_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_class_f16_e64
 
 v_cmp_eq_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -912,10 +912,10 @@ v_cmp_eq_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_eq_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_eq_f16_e64
 
-v_cmp_eq_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_eq_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_eq_f16_e64
 
-v_cmp_eq_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_eq_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_eq_f16_e64
 
 v_cmp_eq_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -924,10 +924,10 @@ v_cmp_eq_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_eq_i16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_eq_i16_e64
 
-v_cmp_eq_i16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_eq_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_eq_i16_e64
 
-v_cmp_eq_i16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_eq_i16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_eq_i16_e64
 
 v_cmp_eq_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -936,10 +936,10 @@ v_cmp_eq_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_eq_u16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_eq_u16_e64
 
-v_cmp_eq_u16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_eq_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_eq_u16_e64
 
-v_cmp_eq_u16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_eq_u16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_eq_u16_e64
 
 v_cmp_f_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -948,10 +948,10 @@ v_cmp_f_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_f_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_f_f16_e64
 
-v_cmp_f_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_f_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_f_f16_e64
 
-v_cmp_f_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_f_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_f_f16_e64
 
 v_cmp_ge_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -960,10 +960,10 @@ v_cmp_ge_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_ge_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ge_f16_e64
 
-v_cmp_ge_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_ge_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ge_f16_e64
 
-v_cmp_ge_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_ge_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ge_f16_e64
 
 v_cmp_ge_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -972,10 +972,10 @@ v_cmp_ge_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_ge_i16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ge_i16_e64
 
-v_cmp_ge_i16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_ge_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ge_i16_e64
 
-v_cmp_ge_i16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_ge_i16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ge_i16_e64
 
 v_cmp_ge_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -984,10 +984,10 @@ v_cmp_ge_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_ge_u16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ge_u16_e64
 
-v_cmp_ge_u16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_ge_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ge_u16_e64
 
-v_cmp_ge_u16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_ge_u16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ge_u16_e64
 
 v_cmp_gt_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -996,10 +996,10 @@ v_cmp_gt_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_gt_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_gt_f16_e64
 
-v_cmp_gt_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_gt_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_gt_f16_e64
 
-v_cmp_gt_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_gt_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_gt_f16_e64
 
 v_cmp_gt_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1008,10 +1008,10 @@ v_cmp_gt_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_gt_i16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_gt_i16_e64
 
-v_cmp_gt_i16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_gt_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_gt_i16_e64
 
-v_cmp_gt_i16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_gt_i16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_gt_i16_e64
 
 v_cmp_gt_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1020,10 +1020,10 @@ v_cmp_gt_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_gt_u16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_gt_u16_e64
 
-v_cmp_gt_u16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_gt_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_gt_u16_e64
 
-v_cmp_gt_u16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_gt_u16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_gt_u16_e64
 
 v_cmp_le_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1032,10 +1032,10 @@ v_cmp_le_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_le_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_le_f16_e64
 
-v_cmp_le_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_le_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_le_f16_e64
 
-v_cmp_le_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_le_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_le_f16_e64
 
 v_cmp_le_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1044,10 +1044,10 @@ v_cmp_le_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_le_i16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_le_i16_e64
 
-v_cmp_le_i16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_le_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_le_i16_e64
 
-v_cmp_le_i16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_le_i16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_le_i16_e64
 
 v_cmp_le_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1056,10 +1056,10 @@ v_cmp_le_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_le_u16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_le_u16_e64
 
-v_cmp_le_u16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_le_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_le_u16_e64
 
-v_cmp_le_u16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_le_u16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_le_u16_e64
 
 v_cmp_lg_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1068,10 +1068,10 @@ v_cmp_lg_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_lg_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lg_f16_e64
 
-v_cmp_lg_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_lg_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lg_f16_e64
 
-v_cmp_lg_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_lg_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lg_f16_e64
 
 v_cmp_lt_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1080,10 +1080,10 @@ v_cmp_lt_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_lt_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lt_f16_e64
 
-v_cmp_lt_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_lt_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lt_f16_e64
 
-v_cmp_lt_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_lt_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lt_f16_e64
 
 v_cmp_lt_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1092,10 +1092,10 @@ v_cmp_lt_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_lt_i16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lt_i16_e64
 
-v_cmp_lt_i16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_lt_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lt_i16_e64
 
-v_cmp_lt_i16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_lt_i16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lt_i16_e64
 
 v_cmp_lt_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1104,10 +1104,10 @@ v_cmp_lt_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_lt_u16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lt_u16_e64
 
-v_cmp_lt_u16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_lt_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lt_u16_e64
 
-v_cmp_lt_u16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_lt_u16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lt_u16_e64
 
 v_cmp_ne_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1116,10 +1116,10 @@ v_cmp_ne_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_ne_i16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ne_i16_e64
 
-v_cmp_ne_i16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_ne_i16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ne_i16_e64
 
-v_cmp_ne_i16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_ne_i16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ne_i16_e64
 
 v_cmp_ne_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1128,10 +1128,10 @@ v_cmp_ne_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_ne_u16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ne_u16_e64
 
-v_cmp_ne_u16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_ne_u16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ne_u16_e64
 
-v_cmp_ne_u16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_ne_u16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ne_u16_e64
 
 v_cmp_neq_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1140,10 +1140,10 @@ v_cmp_neq_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_neq_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_neq_f16_e64
 
-v_cmp_neq_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_neq_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_neq_f16_e64
 
-v_cmp_neq_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_neq_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_neq_f16_e64
 
 v_cmp_nge_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1152,10 +1152,10 @@ v_cmp_nge_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_nge_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nge_f16_e64
 
-v_cmp_nge_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_nge_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nge_f16_e64
 
-v_cmp_nge_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_nge_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nge_f16_e64
 
 v_cmp_ngt_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1164,10 +1164,10 @@ v_cmp_ngt_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_ngt_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ngt_f16_e64
 
-v_cmp_ngt_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_ngt_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ngt_f16_e64
 
-v_cmp_ngt_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_ngt_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ngt_f16_e64
 
 v_cmp_nle_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1176,10 +1176,10 @@ v_cmp_nle_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_nle_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nle_f16_e64
 
-v_cmp_nle_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_nle_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nle_f16_e64
 
-v_cmp_nle_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_nle_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nle_f16_e64
 
 v_cmp_nlg_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1188,10 +1188,10 @@ v_cmp_nlg_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_nlg_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nlg_f16_e64
 
-v_cmp_nlg_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_nlg_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nlg_f16_e64
 
-v_cmp_nlg_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_nlg_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nlg_f16_e64
 
 v_cmp_nlt_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1200,10 +1200,10 @@ v_cmp_nlt_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_nlt_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nlt_f16_e64
 
-v_cmp_nlt_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_nlt_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nlt_f16_e64
 
-v_cmp_nlt_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_nlt_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nlt_f16_e64
 
 v_cmp_o_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1212,10 +1212,10 @@ v_cmp_o_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_o_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_o_f16_e64
 
-v_cmp_o_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_o_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_o_f16_e64
 
-v_cmp_o_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_o_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_o_f16_e64
 
 v_cmp_t_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1224,10 +1224,10 @@ v_cmp_t_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_t_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_t_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_t_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_t_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_t_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
 v_cmp_tru_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1236,10 +1236,10 @@ v_cmp_tru_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_tru_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_tru_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_tru_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_tru_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_tru_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
 v_cmp_u_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
@@ -1248,190 +1248,190 @@ v_cmp_u_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 v_cmp_u_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_u_f16_e64
 
-v_cmp_u_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0]
+v_cmp_u_f16 vcc, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_u_f16_e64
 
-v_cmp_u_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0]
+v_cmp_u_f16 vcc, v127, v255 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_u_f16_e64
 
 v_cmp_class_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_class_f16_e64
 
-v_cmp_class_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_class_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_class_f16_e64
 
 v_cmp_eq_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_eq_f16_e64
 
-v_cmp_eq_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_eq_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_eq_f16_e64
 
 v_cmp_eq_i16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_eq_i16_e64
 
-v_cmp_eq_i16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_eq_i16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_eq_i16_e64
 
 v_cmp_eq_u16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_eq_u16_e64
 
-v_cmp_eq_u16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_eq_u16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_eq_u16_e64
 
 v_cmp_f_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_f_f16_e64
 
-v_cmp_f_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_f_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_f_f16_e64
 
 v_cmp_ge_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ge_f16_e64
 
-v_cmp_ge_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_ge_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ge_f16_e64
 
 v_cmp_ge_i16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ge_i16_e64
 
-v_cmp_ge_i16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_ge_i16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ge_i16_e64
 
 v_cmp_ge_u16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ge_u16_e64
 
-v_cmp_ge_u16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_ge_u16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ge_u16_e64
 
 v_cmp_gt_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_gt_f16_e64
 
-v_cmp_gt_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_gt_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_gt_f16_e64
 
 v_cmp_gt_i16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_gt_i16_e64
 
-v_cmp_gt_i16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_gt_i16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_gt_i16_e64
 
 v_cmp_gt_u16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_gt_u16_e64
 
-v_cmp_gt_u16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_gt_u16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_gt_u16_e64
 
 v_cmp_le_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_le_f16_e64
 
-v_cmp_le_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_le_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_le_f16_e64
 
 v_cmp_le_i16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_le_i16_e64
 
-v_cmp_le_i16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_le_i16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_le_i16_e64
 
 v_cmp_le_u16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_le_u16_e64
 
-v_cmp_le_u16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_le_u16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_le_u16_e64
 
 v_cmp_lg_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lg_f16_e64
 
-v_cmp_lg_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_lg_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lg_f16_e64
 
 v_cmp_lt_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lt_f16_e64
 
-v_cmp_lt_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_lt_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lt_f16_e64
 
 v_cmp_lt_i16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lt_i16_e64
 
-v_cmp_lt_i16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_lt_i16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lt_i16_e64
 
 v_cmp_lt_u16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lt_u16_e64
 
-v_cmp_lt_u16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_lt_u16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_lt_u16_e64
 
 v_cmp_ne_i16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ne_i16_e64
 
-v_cmp_ne_i16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_ne_i16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ne_i16_e64
 
 v_cmp_ne_u16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ne_u16_e64
 
-v_cmp_ne_u16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_ne_u16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ne_u16_e64
 
 v_cmp_neq_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_neq_f16_e64
 
-v_cmp_neq_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_neq_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_neq_f16_e64
 
 v_cmp_nge_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nge_f16_e64
 
-v_cmp_nge_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_nge_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nge_f16_e64
 
 v_cmp_ngt_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ngt_f16_e64
 
-v_cmp_ngt_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_ngt_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_ngt_f16_e64
 
 v_cmp_nle_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nle_f16_e64
 
-v_cmp_nle_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_nle_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nle_f16_e64
 
 v_cmp_nlg_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nlg_f16_e64
 
-v_cmp_nlg_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_nlg_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nlg_f16_e64
 
 v_cmp_nlt_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nlt_f16_e64
 
-v_cmp_nlt_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_nlt_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_nlt_f16_e64
 
 v_cmp_o_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_o_f16_e64
 
-v_cmp_o_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_o_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_o_f16_e64
 
 v_cmp_t_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_t_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_t_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
 v_cmp_tru_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_tru_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_tru_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
 v_cmp_u_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_u_f16_e64
 
-v_cmp_u_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0]
+v_cmp_u_f16 vcc, v128, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmp_u_f16_e64
 
 v_cmp_class_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1440,7 +1440,7 @@ v_cmp_class_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_class_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_class_f16_e64
 
-v_cmp_class_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_class_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_class_f16_e64
 
 v_cmp_eq_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1449,10 +1449,10 @@ v_cmp_eq_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_eq_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_eq_f16_e64
 
-v_cmp_eq_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_eq_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_eq_f16_e64
 
-v_cmp_eq_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_eq_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_eq_f16_e64
 
 v_cmp_eq_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1461,10 +1461,10 @@ v_cmp_eq_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_eq_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_eq_i16_e64
 
-v_cmp_eq_i16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_eq_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_eq_i16_e64
 
-v_cmp_eq_i16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_eq_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_eq_i16_e64
 
 v_cmp_eq_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1473,10 +1473,10 @@ v_cmp_eq_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_eq_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_eq_u16_e64
 
-v_cmp_eq_u16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_eq_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_eq_u16_e64
 
-v_cmp_eq_u16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_eq_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_eq_u16_e64
 
 v_cmp_f_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1485,10 +1485,10 @@ v_cmp_f_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_f_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_f_f16_e64
 
-v_cmp_f_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_f_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_f_f16_e64
 
-v_cmp_f_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_f_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_f_f16_e64
 
 v_cmp_ge_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1497,10 +1497,10 @@ v_cmp_ge_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_ge_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ge_f16_e64
 
-v_cmp_ge_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ge_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ge_f16_e64
 
-v_cmp_ge_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ge_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ge_f16_e64
 
 v_cmp_ge_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1509,10 +1509,10 @@ v_cmp_ge_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_ge_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ge_i16_e64
 
-v_cmp_ge_i16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ge_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ge_i16_e64
 
-v_cmp_ge_i16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ge_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ge_i16_e64
 
 v_cmp_ge_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1521,10 +1521,10 @@ v_cmp_ge_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_ge_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ge_u16_e64
 
-v_cmp_ge_u16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ge_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ge_u16_e64
 
-v_cmp_ge_u16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ge_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ge_u16_e64
 
 v_cmp_gt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1533,10 +1533,10 @@ v_cmp_gt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_gt_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_gt_f16_e64
 
-v_cmp_gt_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_gt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_gt_f16_e64
 
-v_cmp_gt_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_gt_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_gt_f16_e64
 
 v_cmp_gt_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1545,10 +1545,10 @@ v_cmp_gt_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_gt_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_gt_i16_e64
 
-v_cmp_gt_i16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_gt_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_gt_i16_e64
 
-v_cmp_gt_i16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_gt_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_gt_i16_e64
 
 v_cmp_gt_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1557,10 +1557,10 @@ v_cmp_gt_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_gt_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_gt_u16_e64
 
-v_cmp_gt_u16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_gt_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_gt_u16_e64
 
-v_cmp_gt_u16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_gt_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_gt_u16_e64
 
 v_cmp_le_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1569,10 +1569,10 @@ v_cmp_le_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_le_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_le_f16_e64
 
-v_cmp_le_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_le_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_le_f16_e64
 
-v_cmp_le_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_le_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_le_f16_e64
 
 v_cmp_le_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1581,10 +1581,10 @@ v_cmp_le_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_le_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_le_i16_e64
 
-v_cmp_le_i16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_le_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_le_i16_e64
 
-v_cmp_le_i16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_le_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_le_i16_e64
 
 v_cmp_le_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1593,10 +1593,10 @@ v_cmp_le_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_le_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_le_u16_e64
 
-v_cmp_le_u16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_le_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_le_u16_e64
 
-v_cmp_le_u16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_le_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_le_u16_e64
 
 v_cmp_lg_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1605,10 +1605,10 @@ v_cmp_lg_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_lg_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lg_f16_e64
 
-v_cmp_lg_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_lg_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lg_f16_e64
 
-v_cmp_lg_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_lg_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lg_f16_e64
 
 v_cmp_lt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1617,10 +1617,10 @@ v_cmp_lt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_lt_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lt_f16_e64
 
-v_cmp_lt_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_lt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lt_f16_e64
 
-v_cmp_lt_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_lt_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lt_f16_e64
 
 v_cmp_lt_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1629,10 +1629,10 @@ v_cmp_lt_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_lt_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lt_i16_e64
 
-v_cmp_lt_i16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_lt_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lt_i16_e64
 
-v_cmp_lt_i16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_lt_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lt_i16_e64
 
 v_cmp_lt_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1641,10 +1641,10 @@ v_cmp_lt_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_lt_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lt_u16_e64
 
-v_cmp_lt_u16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_lt_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lt_u16_e64
 
-v_cmp_lt_u16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_lt_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lt_u16_e64
 
 v_cmp_ne_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1653,10 +1653,10 @@ v_cmp_ne_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_ne_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ne_i16_e64
 
-v_cmp_ne_i16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ne_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ne_i16_e64
 
-v_cmp_ne_i16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ne_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ne_i16_e64
 
 v_cmp_ne_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1665,10 +1665,10 @@ v_cmp_ne_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_ne_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ne_u16_e64
 
-v_cmp_ne_u16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ne_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ne_u16_e64
 
-v_cmp_ne_u16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ne_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ne_u16_e64
 
 v_cmp_neq_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1677,10 +1677,10 @@ v_cmp_neq_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_neq_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_neq_f16_e64
 
-v_cmp_neq_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_neq_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_neq_f16_e64
 
-v_cmp_neq_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_neq_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_neq_f16_e64
 
 v_cmp_nge_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1689,10 +1689,10 @@ v_cmp_nge_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_nge_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nge_f16_e64
 
-v_cmp_nge_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_nge_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nge_f16_e64
 
-v_cmp_nge_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_nge_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nge_f16_e64
 
 v_cmp_ngt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1701,10 +1701,10 @@ v_cmp_ngt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_ngt_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ngt_f16_e64
 
-v_cmp_ngt_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ngt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ngt_f16_e64
 
-v_cmp_ngt_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ngt_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ngt_f16_e64
 
 v_cmp_nle_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1713,10 +1713,10 @@ v_cmp_nle_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_nle_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nle_f16_e64
 
-v_cmp_nle_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_nle_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nle_f16_e64
 
-v_cmp_nle_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_nle_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nle_f16_e64
 
 v_cmp_nlg_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1725,10 +1725,10 @@ v_cmp_nlg_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_nlg_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nlg_f16_e64
 
-v_cmp_nlg_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_nlg_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nlg_f16_e64
 
-v_cmp_nlg_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_nlg_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nlg_f16_e64
 
 v_cmp_nlt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1737,10 +1737,10 @@ v_cmp_nlt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_nlt_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nlt_f16_e64
 
-v_cmp_nlt_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_nlt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nlt_f16_e64
 
-v_cmp_nlt_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_nlt_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nlt_f16_e64
 
 v_cmp_o_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1749,10 +1749,10 @@ v_cmp_o_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_o_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_o_f16_e64
 
-v_cmp_o_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_o_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_o_f16_e64
 
-v_cmp_o_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_o_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_o_f16_e64
 
 v_cmp_t_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1761,10 +1761,10 @@ v_cmp_t_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_t_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_t_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_t_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_t_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_t_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
 v_cmp_tru_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1773,10 +1773,10 @@ v_cmp_tru_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_tru_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_tru_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_tru_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_tru_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_tru_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
 v_cmp_u_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
@@ -1785,189 +1785,189 @@ v_cmp_u_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_cmp_u_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_u_f16_e64
 
-v_cmp_u_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_u_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_u_f16_e64
 
-v_cmp_u_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_u_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_u_f16_e64
 
 v_cmp_class_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_class_f16_e64
 
-v_cmp_class_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_class_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_class_f16_e64
 
 v_cmp_eq_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_eq_f16_e64
 
-v_cmp_eq_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_eq_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_eq_f16_e64
 
 v_cmp_eq_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_eq_i16_e64
 
-v_cmp_eq_i16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_eq_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_eq_i16_e64
 
 v_cmp_eq_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_eq_u16_e64
 
-v_cmp_eq_u16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_eq_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_eq_u16_e64
 
 v_cmp_f_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_f_f16_e64
 
-v_cmp_f_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_f_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_f_f16_e64
 
 v_cmp_ge_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ge_f16_e64
 
-v_cmp_ge_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ge_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ge_f16_e64
 
 v_cmp_ge_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ge_i16_e64
 
-v_cmp_ge_i16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ge_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ge_i16_e64
 
 v_cmp_ge_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ge_u16_e64
 
-v_cmp_ge_u16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ge_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ge_u16_e64
 
 v_cmp_gt_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_gt_f16_e64
 
-v_cmp_gt_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_gt_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_gt_f16_e64
 
 v_cmp_gt_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_gt_i16_e64
 
-v_cmp_gt_i16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_gt_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_gt_i16_e64
 
 v_cmp_gt_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_gt_u16_e64
 
-v_cmp_gt_u16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_gt_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_gt_u16_e64
 
 v_cmp_le_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_le_f16_e64
 
-v_cmp_le_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_le_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_le_f16_e64
 
 v_cmp_le_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_le_i16_e64
 
-v_cmp_le_i16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_le_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_le_i16_e64
 
 v_cmp_le_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_le_u16_e64
 
-v_cmp_le_u16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_le_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_le_u16_e64
 
 v_cmp_lg_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lg_f16_e64
 
-v_cmp_lg_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_lg_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lg_f16_e64
 
 v_cmp_lt_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lt_f16_e64
 
-v_cmp_lt_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_lt_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lt_f16_e64
 
 v_cmp_lt_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lt_i16_e64
 
-v_cmp_lt_i16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_lt_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lt_i16_e64
 
 v_cmp_lt_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lt_u16_e64
 
-v_cmp_lt_u16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_lt_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_lt_u16_e64
 
 v_cmp_ne_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ne_i16_e64
 
-v_cmp_ne_i16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ne_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ne_i16_e64
 
 v_cmp_ne_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ne_u16_e64
 
-v_cmp_ne_u16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ne_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ne_u16_e64
 
 v_cmp_neq_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_neq_f16_e64
 
-v_cmp_neq_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_neq_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_neq_f16_e64
 
 v_cmp_nge_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nge_f16_e64
 
-v_cmp_nge_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_nge_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nge_f16_e64
 
 v_cmp_ngt_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ngt_f16_e64
 
-v_cmp_ngt_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_ngt_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_ngt_f16_e64
 
 v_cmp_nle_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nle_f16_e64
 
-v_cmp_nle_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_nle_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nle_f16_e64
 
 v_cmp_nlg_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nlg_f16_e64
 
-v_cmp_nlg_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_nlg_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nlg_f16_e64
 
 v_cmp_nlt_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nlt_f16_e64
 
-v_cmp_nlt_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_nlt_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_nlt_f16_e64
 
 v_cmp_o_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_o_f16_e64
 
-v_cmp_o_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_o_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_o_f16_e64
 
 v_cmp_t_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_t_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_t_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
 v_cmp_tru_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
-v_cmp_tru_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_tru_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_t_f16_e64
 
 v_cmp_u_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_u_f16_e64
 
-v_cmp_u_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmp_u_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmp_u_f16_e64
 
diff --git a/llvm/test/MC/AMDGPU/wave32.s b/llvm/test/MC/AMDGPU/wave32.s
index c52693076e2c5..25bb4fd84433b 100644
--- a/llvm/test/MC/AMDGPU/wave32.s
+++ b/llvm/test/MC/AMDGPU/wave32.s
@@ -1,7 +1,7 @@
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX1032 %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX1064 %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck -check-prefix=GFX1032-ERR --implicit-check-not=error: %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck -check-prefix=GFX1064-ERR --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck -check-prefix=GFX1032 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX1064 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck -check-prefix=GFX1032-ERR --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck -check-prefix=GFX1064-ERR --implicit-check-not=error: %s
 
 v_cmp_ge_i32_e32 s0, v0
 // GFX1032: v_cmp_ge_i32_e32 vcc_lo, s0, v0 ; encoding: [0x00,0x00,0x0c,0x7d]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt
index 78ca1bbdacf29..31fc10174bb0b 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt
@@ -1,5 +1,5 @@
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX1032 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64,-wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX1064 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX1032 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX1064 %s
 
 # GFX1032:   v_cmp_lt_f32_e32 vcc_lo, s2, v4
 # GFX1064:   v_cmp_lt_f32_e32 vcc, s2, v4

>From 55dda8d68c366fa876a549a8e3b7778aaa787161 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 11 Jul 2024 00:57:39 -0700
Subject: [PATCH 2/2] Just add FeatureWavefrontSize32 unconditionally

Older targets just have FeatureWavefrontSize64 in their definitions.
---
---
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp            | 11 +++++------
 llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp  | 11 +++++------
 .../Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp |  8 ++++----
 3 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index a59893d3cf85d..929d75f2d0757 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -106,12 +106,11 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
   }
 
   if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
-      !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
-    if (getGeneration() >= AMDGPUSubtarget::GFX10)
-      ToggleFeature(AMDGPU::FeatureWavefrontSize32);
-    else
-      ToggleFeature(AMDGPU::FeatureWavefrontSize64);
-  }
+      !hasFeature(AMDGPU::FeatureWavefrontSize64))
+    // If neither wave size feature is set this must be a gfx10+ target;
+    // older generations already have FeatureWavefrontSize64 in their
+    // definition. Default to wave32.
+    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
 
   // We don't support FP64 for EG/NI atm.
   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 1c3925cfad464..0a2fd21bd937b 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1411,12 +1411,11 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
     AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
     FeatureBitset FB = getFeatureBits();
     if (!FB[AMDGPU::FeatureWavefrontSize64] &&
-        !FB[AMDGPU::FeatureWavefrontSize32]) {
-      if (ISA.Major >= 10)
-        copySTI().ToggleFeature(AMDGPU::FeatureWavefrontSize32);
-      else
-        copySTI().ToggleFeature(AMDGPU::FeatureWavefrontSize64);
-    }
+        !FB[AMDGPU::FeatureWavefrontSize32])
+      // If neither wave size feature is set this must be a gfx10+ target;
+      // older generations already have FeatureWavefrontSize64 in their
+      // definition. Default to wave32.
+      copySTI().ToggleFeature(AMDGPU::FeatureWavefrontSize32);
 
     setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits()));
 
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 57d717dd9e634..3f2c3d4d2be8d 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -50,10 +50,10 @@ static const MCSubtargetInfo &addDefaultWaveSize(const MCSubtargetInfo &STI,
   if (!STI.hasFeature(AMDGPU::FeatureWavefrontSize64) &&
       !STI.hasFeature(AMDGPU::FeatureWavefrontSize32)) {
     MCSubtargetInfo &STICopy = Ctx.getSubtargetCopy(STI);
-    if (AMDGPU::isGFX10Plus(STI))
-      STICopy.ToggleFeature(AMDGPU::FeatureWavefrontSize32);
-    else
-      STICopy.ToggleFeature(AMDGPU::FeatureWavefrontSize64);
+    // If neither wave size feature is set this must be a gfx10+ target;
+    // older generations already have FeatureWavefrontSize64 in their
+    // definition. Default to wave32.
+    STICopy.ToggleFeature(AMDGPU::FeatureWavefrontSize32);
     return STICopy;
   }
 

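For readers skimming the thread: the fallback that all three hunks converge on can be shown in isolation. The sketch below uses hypothetical names (WaveFeatures, resolveDefaultWaveSize), not the real LLVM classes, and only assumes what the patch relies on: pre-GFX10 processor definitions always carry FeatureWavefrontSize64, while GFX10+ definitions carry neither wave size feature.

#include <cassert>
#include <cstdio>

// Hypothetical stand-in for the two wave size feature bits.
struct WaveFeatures {
  bool Wave32 = false; // e.g. -mattr=+wavefrontsize32
  bool Wave64 = false; // pre-GFX10 definitions, or -mattr=+wavefrontsize64
};

// Returns the effective wave size after applying the default.
int resolveDefaultWaveSize(WaveFeatures F) {
  // Neither bit set means a GFX10+ target; default it to wave32.
  if (!F.Wave32 && !F.Wave64)
    F.Wave32 = true;
  assert(!(F.Wave32 && F.Wave64) && "conflicting wave size features");
  return F.Wave32 ? 32 : 64;
}

int main() {
  WaveFeatures gfx900;     // pre-GFX10: definition already sets Wave64
  gfx900.Wave64 = true;
  WaveFeatures gfx1010;    // GFX10+: no wave size in the definition
  WaveFeatures gfx1010w64; // GFX10+ with an explicit +wavefrontsize64
  gfx1010w64.Wave64 = true;

  std::printf("gfx900: wave%d\n", resolveDefaultWaveSize(gfx900));             // 64
  std::printf("gfx1010: wave%d\n", resolveDefaultWaveSize(gfx1010));           // 32
  std::printf("gfx1010 (+w64): wave%d\n", resolveDefaultWaveSize(gfx1010w64)); // 64
  return 0;
}

This matches the RUN line updates above, where -mattr=+wavefrontsize64 alone is now enough for the wave64 checks and no explicit -wavefrontsize32 is needed.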

