[llvm] [AMDGPU] Replace `isInlinableLiteral16` with specific version (PR #81345)

Thu Feb 29 20:35:36 PST 2024

https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/81345

>From 9c55e48a87295d80c75dd4a820af5fbfdbaa676c Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Thu, 29 Feb 2024 23:35:24 -0500
Subject: [PATCH] [WIP][AMDGPU] Split `isInlinableLiteral16` into three and
 call the specific version if possible

The current implementation of `isInlinableLiteral16` assumes, a 16-bit inlinable
literal is either an i16 or a fp16. This is not always true because of bf16.
However, we can't tell fp16 and bf16 apart by just looking at the value. This
patch tries to split `isInlinableLiteral16` into three versions, i16, fp16, bf16
respectively, and call the corresponding version.

This patch is based on #81282. The current status is, only two uses of original
`isInlinableLiteral16` are still there. We need to add an extra argument to indicate
the type of the operand the immediate corresponds to. This will also require the
change of the function signature of the two callers.
---
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      |  54 +++++--
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp |  18 +--
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h   |   4 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  28 +++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  24 ++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |   4 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  10 +-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |   8 +-
 llvm/test/CodeGen/AMDGPU/immv216.ll           |  42 ++---
 .../test/CodeGen/AMDGPU/inline-constraints.ll |  12 --
 .../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 144 ++++++++++++------
 11 files changed, 229 insertions(+), 119 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 18d51087ff5fba..62bd864ab06b29 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -2006,8 +2006,12 @@ static bool isInlineableLiteralOp16(int64_t Val, MVT VT, bool HasInv2Pi) {
     return isInlinableIntLiteral(Val);
   }
 
-  // f16/v2f16 operands work correctly for all values.
-  return AMDGPU::isInlinableLiteral16(Val, HasInv2Pi);
+  if (VT.getScalarType() == MVT::f16)
+    return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
+
+  assert(VT.getScalarType() == MVT::bf16);
+
+  return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
 }
 
 bool AMDGPUOperand::isInlinableImm(MVT type) const {
@@ -2375,15 +2379,26 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
     return;
 
   case AMDGPU::OPERAND_REG_IMM_INT16:
-  case AMDGPU::OPERAND_REG_IMM_FP16:
-  case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
-  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
   case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+    if (isSafeTruncation(Val, 16) &&
+        AMDGPU::isInlinableIntLiteral(static_cast<int16_t>(Val))) {
+      Inst.addOperand(MCOperand::createImm(Val));
+      setImmKindConst();
+      return;
+    }
+
+    Inst.addOperand(MCOperand::createImm(Val & 0xffff));
+    setImmKindLiteral();
+    return;
+
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+  case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
   case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
     if (isSafeTruncation(Val, 16) &&
-        AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
-                                     AsmParser->hasInv2PiInlineImm())) {
+        AMDGPU::isInlinableLiteralFP16(static_cast<int16_t>(Val),
+                                       AsmParser->hasInv2PiInlineImm())) {
       Inst.addOperand(MCOperand::createImm(Val));
       setImmKindConst();
       return;
@@ -2410,12 +2425,17 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
     return;
 
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: {
+    assert(isSafeTruncation(Val, 16));
+    assert(AMDGPU::isInlinableIntLiteral(static_cast<int16_t>(Val)));
+    Inst.addOperand(MCOperand::createImm(Val));
+    return;
+  }
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
-  case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
     assert(isSafeTruncation(Val, 16));
-    assert(AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
-                                        AsmParser->hasInv2PiInlineImm()));
+    assert(AMDGPU::isInlinableLiteralFP16(static_cast<int16_t>(Val),
+                                          AsmParser->hasInv2PiInlineImm()));
 
     Inst.addOperand(MCOperand::createImm(Val));
     return;
@@ -3559,7 +3579,19 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
         OperandType == AMDGPU::OPERAND_REG_IMM_V2BF16)
       return AMDGPU::isInlinableLiteralV2BF16(Val);
 
-    return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm());
+    if (OperandType == AMDGPU::OPERAND_REG_IMM_FP16 ||
+        OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP16 ||
+        OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP16 ||
+        OperandType == AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED)
+      return AMDGPU::isInlinableLiteralFP16(Val, hasInv2PiInlineImm());
+
+    if (OperandType == AMDGPU::OPERAND_REG_IMM_BF16 ||
+        OperandType == AMDGPU::OPERAND_REG_INLINE_C_BF16 ||
+        OperandType == AMDGPU::OPERAND_REG_INLINE_AC_BF16 ||
+        OperandType == AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED)
+      return AMDGPU::isInlinableLiteralBF16(Val, hasInv2PiInlineImm());
+
+    llvm_unreachable("invalid operand type");
   }
   default:
     llvm_unreachable("invalid operand size");
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index a32be1e50a6053..683e8dad796c10 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -460,10 +460,8 @@ void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm,
   }
 }
 
-// This must accept a 32-bit immediate value to correctly handle packed 16-bit
-// operations.
-static bool printImmediateFloat16(uint32_t Imm, const MCSubtargetInfo &STI,
-                                  raw_ostream &O) {
+static bool printImmediateFP16(uint32_t Imm, const MCSubtargetInfo &STI,
+                               raw_ostream &O) {
   if (Imm == 0x3C00)
     O << "1.0";
   else if (Imm == 0xBC00)
@@ -529,9 +527,9 @@ void AMDGPUInstPrinter::printImmediateBF16(uint32_t Imm,
   O << formatHex(static_cast<uint64_t>(Imm));
 }
 
-void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
-                                         const MCSubtargetInfo &STI,
-                                         raw_ostream &O) {
+void AMDGPUInstPrinter::printImmediateF16(uint32_t Imm,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
   int16_t SImm = static_cast<int16_t>(Imm);
   if (isInlinableIntLiteral(SImm)) {
     O << SImm;
@@ -539,7 +537,7 @@ void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
   }
 
   uint16_t HImm = static_cast<uint16_t>(Imm);
-  if (printImmediateFloat16(HImm, STI, O))
+  if (printImmediateFP16(HImm, STI, O))
     return;
 
   uint64_t Imm16 = static_cast<uint16_t>(Imm);
@@ -566,7 +564,7 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType,
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
     if (isUInt<16>(Imm) &&
-        printImmediateFloat16(static_cast<uint16_t>(Imm), STI, O))
+        printImmediateFP16(static_cast<uint16_t>(Imm), STI, O))
       return;
     break;
   case AMDGPU::OPERAND_REG_IMM_V2BF16:
@@ -845,7 +843,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
     case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
     case AMDGPU::OPERAND_REG_IMM_FP16:
     case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
-      printImmediate16(Op.getImm(), STI, O);
+      printImmediateF16(Op.getImm(), STI, O);
       break;
     case AMDGPU::OPERAND_REG_INLINE_C_BF16:
     case AMDGPU::OPERAND_REG_INLINE_AC_BF16:
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index 15ecbf2e5e5918..c801eaf1111e2f 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -86,10 +86,10 @@ class AMDGPUInstPrinter : public MCInstPrinter {
                       raw_ostream &O);
   void printImmediateInt16(uint32_t Imm, const MCSubtargetInfo &STI,
                            raw_ostream &O);
-  void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
-                        raw_ostream &O);
   void printImmediateBF16(uint32_t Imm, const MCSubtargetInfo &STI,
                           raw_ostream &O);
+  void printImmediateF16(uint32_t Imm, const MCSubtargetInfo &STI,
+                         raw_ostream &O);
   void printImmediateV216(uint32_t Imm, uint8_t OpType,
                           const MCSubtargetInfo &STI, raw_ostream &O);
   bool printImmediateFloat32(uint32_t Imm, const MCSubtargetInfo &STI,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 84ef9679ab9563..2e11674109ef1f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15374,16 +15374,32 @@ bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
   llvm_unreachable("Invalid asm constraint");
 }
 
-bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
-                                              uint64_t Val,
+bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
                                               unsigned MaxSize) const {
   unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
   bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
-  if ((Size == 16 && AMDGPU::isInlinableLiteral16(Val, HasInv2Pi)) ||
-      (Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
-      (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi))) {
+  if (Size == 16) {
+    MVT VT = Op.getSimpleValueType();
+    switch (VT.SimpleTy) {
+    default:
+      return false;
+    case MVT::i16:
+      return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
+    case MVT::f16:
+      return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
+    case MVT::bf16:
+      return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
+    case MVT::v2i16:
+      return AMDGPU::getInlineEncodingV2I16(Val).has_value();
+    case MVT::v2f16:
+      return AMDGPU::getInlineEncodingV2F16(Val).has_value();
+    case MVT::v2bf16:
+      return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
+    }
+  }
+  if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
+      (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
     return true;
-  }
   return false;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 31ced9d41e15e2..40de39dae1758a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4121,8 +4121,26 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
                                         ST.hasInv2PiInlineImm());
   case 16:
     return ST.has16BitInsts() &&
-           AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
-                                        ST.hasInv2PiInlineImm());
+           AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
+                                         ST.hasInv2PiInlineImm());
+  default:
+    llvm_unreachable("invalid bitwidth");
+  }
+}
+
+bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
+  APInt IntImm = Imm.bitcastToAPInt();
+  bool HasInv2Pi = ST.hasInv2PiInlineImm();
+  switch (IntImm.getBitWidth()) {
+  case 32:
+  case 64:
+    return isInlineConstant(IntImm);
+  case 16:
+    return ST.has16BitInsts() &&
+           (Imm.isIEEE() ? AMDGPU::isInlinableLiteralFP16(IntImm.getSExtValue(),
+                                                          HasInv2Pi)
+                         : AMDGPU::isInlinableLiteralBF16(IntImm.getSExtValue(),
+                                                          HasInv2Pi));
   default:
     llvm_unreachable("invalid bitwidth");
   }
@@ -4200,7 +4218,7 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
       // constants in these cases
       int16_t Trunc = static_cast<int16_t>(Imm);
       return ST.has16BitInsts() &&
-             AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
+             AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
     }
 
     return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index a8a33a5fecb413..fe271e8ce790c9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -984,9 +984,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 
   bool isInlineConstant(const APInt &Imm) const;
 
-  bool isInlineConstant(const APFloat &Imm) const {
-    return isInlineConstant(Imm.bitcastToAPInt());
-  }
+  bool isInlineConstant(const APFloat &Imm) const;
 
   // Returns true if this non-register operand definitely does not need to be
   // encoded as a 32-bit literal. Note that this function handles all kinds of
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 963dc2882fcc0d..63285c06edaf2c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2647,13 +2647,19 @@ bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi) {
          Val == 0x3E22;   // 1.0 / (2.0 * pi)
 }
 
-bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
+bool isInlinableLiteralI16(int16_t Literal, bool HasInv2Pi) {
   if (!HasInv2Pi)
     return false;
-
   if (isInlinableIntLiteral(Literal))
     return true;
+  return Literal == static_cast<int16_t>(0x3e22f983);
+}
 
+bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi) {
+  if (!HasInv2Pi)
+    return false;
+  if (isInlinableIntLiteral(Literal))
+    return true;
   uint16_t Val = static_cast<uint16_t>(Literal);
   return Val == 0x3C00 || // 1.0
          Val == 0xBC00 || // -1.0
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 6edf01d1217f2d..9fcb4caca30b01 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1397,7 +1397,13 @@ LLVM_READNONE
 bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi);
 
 LLVM_READNONE
-bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);
+bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi);
+
+LLVM_READNONE
+bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi);
+
+LLVM_READNONE
+bool isInlinableLiteralI16(int16_t Literal, bool HasInv2Pi);
 
 LLVM_READNONE
 std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal);
diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll
index b66ca71a327495..ae51c3edf1c7e7 100644
--- a/llvm/test/CodeGen/AMDGPU/immv216.ll
+++ b/llvm/test/CodeGen/AMDGPU/immv216.ll
@@ -577,40 +577,40 @@ define amdgpu_kernel void @add_inline_imm_64_v2f16(ptr addrspace(1) %out, <2 x h
 }
 
 ; GCN-LABEL: {{^}}mul_inline_imm_0.5_v2i16:
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x38003800
-; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
+; GFX9: s_movk_i32 [[K:s[0-9]+]], 0x3800
+; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0]
 
-; GFX10: v_pk_mul_lo_u16 v0, 0x38003800, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x38,0x00,0x38]
+; GFX10: v_pk_mul_lo_u16 v0, 0x3800, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x38,0x00,0x00]
 define <2 x i16> @mul_inline_imm_0.5_v2i16(<2 x i16> %x) {
   %y = mul <2 x i16> %x, bitcast (<2 x half> <half 0.5, half 0.5> to <2 x i16>)
   ret <2 x i16> %y
 }
 
 ; GCN-LABEL: {{^}}mul_inline_imm_neg_0.5_v2i16:
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xb800b800
-; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
+; GFX9: s_movk_i32 [[K:s[0-9]+]], 0xb800
+; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0]
 
-; GFX10: v_pk_mul_lo_u16 v0, 0xb800b800, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xb8,0x00,0xb8]
+; GFX10: v_pk_mul_lo_u16 v0, 0xffffb800, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xb8,0xff,0xff]
 define <2 x i16> @mul_inline_imm_neg_0.5_v2i16(<2 x i16> %x) {
   %y = mul <2 x i16> %x, bitcast (<2 x half> <half -0.5, half -0.5> to <2 x i16>)
   ret <2 x i16> %y
 }
 
 ; GCN-LABEL: {{^}}mul_inline_imm_1.0_v2i16:
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x3c003c00
-; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
+; GFX9: s_movk_i32 [[K:s[0-9]+]], 0x3c00
+; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0]
 
-; GFX10: v_pk_mul_lo_u16 v0, 0x3c003c00, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x3c,0x00,0x3c]
+; GFX10: v_pk_mul_lo_u16 v0, 0x3c00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x3c,0x00,0x00]
 define <2 x i16> @mul_inline_imm_1.0_v2i16(<2 x i16> %x) {
   %y = mul <2 x i16> %x, bitcast (<2 x half> <half 1.0, half 1.0> to <2 x i16>)
   ret <2 x i16> %y
 }
 
 ; GCN-LABEL: {{^}}mul_inline_imm_neg_1.0_v2i16:
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00bc00
-; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
+; GFX9: s_movk_i32 [[K:s[0-9]+]], 0xbc00
+; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0]
 
-; GFX10: v_pk_mul_lo_u16 v0, 0xbc00bc00, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xbc,0x00,0xbc]
+; GFX10: v_pk_mul_lo_u16 v0, 0xffffbc00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xbc,0xff,0xff]
 define <2 x i16> @mul_inline_imm_neg_1.0_v2i16(<2 x i16> %x) {
   %y = mul <2 x i16> %x, bitcast (<2 x half> <half -1.0, half -1.0> to <2 x i16>)
   ret <2 x i16> %y
@@ -635,10 +635,10 @@ define <2 x i16> @shl_inline_imm_neg_2.0_v2i16(<2 x i16> %x) {
 }
 
 ; GCN-LABEL: {{^}}mul_inline_imm_4.0_v2i16:
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004400
-; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
+; GFX9: s_movk_i32 [[K:s[0-9]+]], 0x4400
+; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0]
 
-; GFX10: v_pk_mul_lo_u16 v0, 0x44004400, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x44,0x00,0x44]
+; GFX10: v_pk_mul_lo_u16 v0, 0x4400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x44,0x00,0x00]
 define <2 x i16> @mul_inline_imm_4.0_v2i16(<2 x i16> %x) {
   %y = mul <2 x i16> %x, bitcast (<2 x half> <half 4.0, half 4.0> to <2 x i16>)
   ret <2 x i16> %y
@@ -646,20 +646,20 @@ define <2 x i16> @mul_inline_imm_4.0_v2i16(<2 x i16> %x) {
 }
 
 ; GCN-LABEL: {{^}}mul_inline_imm_neg_4.0_v2i16:
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xc400c400
-; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
+; GFX9: s_movk_i32 [[K:s[0-9]+]], 0xc400
+; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0]
 
-; GFX10: v_pk_mul_lo_u16 v0, 0xc400c400, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc4,0x00,0xc4]
+; GFX10: v_pk_mul_lo_u16 v0, 0xffffc400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc4,0xff,0xff]
 define <2 x i16> @mul_inline_imm_neg_4.0_v2i16(<2 x i16> %x) {
   %y = mul <2 x i16> %x, bitcast (<2 x half> <half -4.0, half -4.0> to <2 x i16>)
   ret <2 x i16> %y
 }
 
 ; GCN-LABEL: {{^}}mul_inline_imm_inv2pi_v2i16:
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x31183118
-; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
+; GFX9: s_movk_i32 [[K:s[0-9]+]], 0x3118
+; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0]
 
-; GFX10: v_pk_mul_lo_u16 v0, 0x31183118, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x18,0x31,0x18,0x31]
+; GFX10: v_pk_mul_lo_u16 v0, 0x3118, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x18,0x31,0x00,0x00]
 define <2 x i16> @mul_inline_imm_inv2pi_v2i16(<2 x i16> %x) {
   %y = mul <2 x i16> %x, bitcast (<2 x half> <half 0xH3118, half 0xH3118> to <2 x i16>)
   ret <2 x i16> %y
diff --git a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
index 9ef246fe2e1015..7bd6b037386b04 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
@@ -97,7 +97,6 @@ define i32 @inline_A_constraint_H1() {
 
 ; NOSI: error: invalid operand for inline asm constraint 'A'
 ; VI-LABEL: {{^}}inline_A_constraint_H2:
-; VI: v_mov_b32 {{v[0-9]+}}, 0x3c00
 define i32 @inline_A_constraint_H2() {
   %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 bitcast (half 1.0 to i16))
   ret i32 %v0
@@ -105,7 +104,6 @@ define i32 @inline_A_constraint_H2() {
 
 ; NOSI: error: invalid operand for inline asm constraint 'A'
 ; VI-LABEL: {{^}}inline_A_constraint_H3:
-; VI: v_mov_b32 {{v[0-9]+}}, 0xbc00
 define i32 @inline_A_constraint_H3() {
   %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 bitcast (half -1.0 to i16))
   ret i32 %v0
@@ -113,7 +111,6 @@ define i32 @inline_A_constraint_H3() {
 
 ; NOSI: error: invalid operand for inline asm constraint 'A'
 ; VI-LABEL: {{^}}inline_A_constraint_H4:
-; VI: v_mov_b32 {{v[0-9]+}}, 0x3118
 define i32 @inline_A_constraint_H4() {
   %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(half 0xH3118)
   ret i32 %v0
@@ -121,7 +118,6 @@ define i32 @inline_A_constraint_H4() {
 
 ; NOSI: error: invalid operand for inline asm constraint 'A'
 ; VI-LABEL: {{^}}inline_A_constraint_H5:
-; VI: v_mov_b32 {{v[0-9]+}}, 0x3118
 define i32 @inline_A_constraint_H5() {
   %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 bitcast (half 0xH3118 to i16))
   ret i32 %v0
@@ -129,7 +125,6 @@ define i32 @inline_A_constraint_H5() {
 
 ; NOSI: error: invalid operand for inline asm constraint 'A'
 ; VI-LABEL: {{^}}inline_A_constraint_H6:
-; VI: v_mov_b32 {{v[0-9]+}}, 0xb800
 define i32 @inline_A_constraint_H6() {
   %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(half -0.5)
   ret i32 %v0
@@ -293,7 +288,6 @@ define i32 @inline_A_constraint_V0() {
 
 ; NOSI: error: invalid operand for inline asm constraint 'A'
 ; VI-LABEL: {{^}}inline_A_constraint_V1:
-; VI: v_mov_b32 {{v[0-9]+}}, 0xb800
 define i32 @inline_A_constraint_V1() {
   %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(<2 x half> <half -0.5, half -0.5>)
   ret i32 %v0
@@ -970,7 +964,6 @@ define i32 @inline_DA_constraint_H1() {
 
 ; NOSI: error: invalid operand for inline asm constraint 'DA'
 ; VI-LABEL: {{^}}inline_DA_constraint_H2:
-; VI: v_mov_b32 {{v[0-9]+}}, 0x3c00
 define i32 @inline_DA_constraint_H2() {
   %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,^DA"(i16 bitcast (half 1.0 to i16))
   ret i32 %v0
@@ -978,7 +971,6 @@ define i32 @inline_DA_constraint_H2() {
 
 ; NOSI: error: invalid operand for inline asm constraint 'DA'
 ; VI-LABEL: {{^}}inline_DA_constraint_H3:
-; VI: v_mov_b32 {{v[0-9]+}}, 0xbc00
 define i32 @inline_DA_constraint_H3() {
   %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,^DA"(i16 bitcast (half -1.0 to i16))
   ret i32 %v0
@@ -986,7 +978,6 @@ define i32 @inline_DA_constraint_H3() {
 
 ; NOSI: error: invalid operand for inline asm constraint 'DA'
 ; VI-LABEL: {{^}}inline_DA_constraint_H4:
-; VI: v_mov_b32 {{v[0-9]+}}, 0x3118
 define i32 @inline_DA_constraint_H4() {
   %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,^DA"(half 0xH3118)
   ret i32 %v0
@@ -994,7 +985,6 @@ define i32 @inline_DA_constraint_H4() {
 
 ; NOSI: error: invalid operand for inline asm constraint 'DA'
 ; VI-LABEL: {{^}}inline_DA_constraint_H5:
-; VI: v_mov_b32 {{v[0-9]+}}, 0x3118
 define i32 @inline_DA_constraint_H5() {
   %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,^DA"(i16 bitcast (half 0xH3118 to i16))
   ret i32 %v0
@@ -1002,7 +992,6 @@ define i32 @inline_DA_constraint_H5() {
 
 ; NOSI: error: invalid operand for inline asm constraint 'DA'
 ; VI-LABEL: {{^}}inline_DA_constraint_H6:
-; VI: v_mov_b32 {{v[0-9]+}}, 0xb800
 define i32 @inline_DA_constraint_H6() {
   %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,^DA"(half -0.5)
   ret i32 %v0
@@ -1164,7 +1153,6 @@ define i32 @inline_DA_constraint_V0() {
 
 ; NOSI: error: invalid operand for inline asm constraint 'DA'
 ; VI-LABEL: {{^}}inline_DA_constraint_V1:
-; VI: v_mov_b32 {{v[0-9]+}}, 0xb800
 define i32 @inline_DA_constraint_V1() {
   %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,^DA"(<2 x half> <half -0.5, half -0.5>)
   ret i32 %v0
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 5de9b0b92c9a02..1a55bf608ebf51 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -3400,9 +3400,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0xc400c400
+; GFX9-SDAG-NEXT:    s_movk_i32 s2, 0xc400
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s2
+; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
@@ -3418,29 +3418,53 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
 ; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: v_test_v2i16_x_add_neg_fpone:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_add_u16 v1, 0xc400c400, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
+; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_test_v2i16_x_add_neg_fpone:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_add_u16 v1, 0xc400c400, v1
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0xc400c400, v1
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1]
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0xc400c400, v1
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -3541,9 +3565,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0x44004400
+; GFX9-SDAG-NEXT:    s_movk_i32 s2, 0x4400
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s2
+; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
@@ -3559,29 +3583,53 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
 ; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: v_test_v2i16_x_add_neg_negfpone:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_add_u16 v1, 0x44004400, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
+; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_test_v2i16_x_add_neg_negfpone:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_add_u16 v1, 0x44004400, v1
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0x44004400, v1
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0x44004400, v1
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext