[llvm] b24436a - GlobalISel: Lower funnel shifts

Matt Arsenault via llvm-commits <llvm-commits at lists.llvm.org>
Tue Mar 23 06:11:24 PDT 2021


Author: Matt Arsenault
Date: 2021-03-23T09:11:17-04:00
New Revision: b24436ac96bdf3f2c545fc85dc8af239d618c9c4

URL: https://github.com/llvm/llvm-project/commit/b24436ac96bdf3f2c545fc85dc8af239d618c9c4
DIFF: https://github.com/llvm/llvm-project/commit/b24436ac96bdf3f2c545fc85dc8af239d618c9c4.diff

LOG: GlobalISel: Lower funnel shifts
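
A funnel shift concatenates its first two operands and extracts one
operand-width slice, shifted by the amount taken modulo the bit width:
fshl returns the high BW bits of ((X:Y) << (Z % BW)), and fshr the low
BW bits of ((X:Y) >> (Z % BW)). As a reference point for the lowerings
below, a minimal C++ model of the 32-bit case (illustrative only, not
code emitted by this patch):

    #include <cstdint>

    // fshl for i32: the shift amount is implicitly modulo 32. Amt == 0 is
    // split out because shifting Y right by 32 would be undefined in C++.
    static uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
      unsigned Amt = Z % 32;
      return Amt == 0 ? X : (X << Amt) | (Y >> (32 - Amt));
    }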

Added: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir

Modified: 
    llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
    llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
    llvm/include/llvm/CodeGen/GlobalISel/Utils.h
    llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
    llvm/lib/CodeGen/GlobalISel/Utils.cpp
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 8a96e379db6f..9d09e8a9015f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -345,6 +345,9 @@ class LegalizerHelper {
   LegalizeResult lowerLoad(MachineInstr &MI);
   LegalizeResult lowerStore(MachineInstr &MI);
   LegalizeResult lowerBitCount(MachineInstr &MI);
+  LegalizeResult lowerFunnelShiftWithInverse(MachineInstr &MI);
+  LegalizeResult lowerFunnelShiftAsShifts(MachineInstr &MI);
+  LegalizeResult lowerFunnelShift(MachineInstr &MI);
 
   LegalizeResult lowerU64ToF32BitOps(MachineInstr &MI);
   LegalizeResult lowerUITOFP(MachineInstr &MI);

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 9cb86db65bc5..3868211c0298 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1444,6 +1444,13 @@ class MachineIRBuilder {
     return buildInstr(TargetOpcode::G_SMULH, {Dst}, {Src0, Src1}, Flags);
   }
 
+  /// Build and insert \p Res = G_UREM \p Op0, \p Op1
+  MachineInstrBuilder buildURem(const DstOp &Dst, const SrcOp &Src0,
+                                const SrcOp &Src1,
+                                Optional<unsigned> Flags = None) {
+    return buildInstr(TargetOpcode::G_UREM, {Dst}, {Src0, Src1}, Flags);
+  }
+
   MachineInstrBuilder buildFMul(const DstOp &Dst, const SrcOp &Src0,
                                 const SrcOp &Src1,
                                 Optional<unsigned> Flags = None) {

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
index f74a37e60450..ddf78356615d 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -328,6 +328,13 @@ bool isBuildVectorAllOnes(const MachineInstr &MI,
 Optional<RegOrConstant> getVectorSplat(const MachineInstr &MI,
                                        const MachineRegisterInfo &MRI);
 
+/// Attempt to match a unary predicate against a scalar/splat constant or every
+/// element of a constant G_BUILD_VECTOR. Undef sources are matched only when
+/// \p AllowUndefs is true; \p Match is then invoked with a null \p ConstVal.
+bool matchUnaryPredicate(const MachineRegisterInfo &MRI, Register Reg,
+                         std::function<bool(const Constant *ConstVal)> Match,
+                         bool AllowUndefs = false);
+
 /// Returns true if given the TargetLowering's boolean contents information,
 /// the value \p Val contains a true value.
 bool isConstTrueVal(const TargetLowering &TLI, int64_t Val, bool IsVector,

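The new matchUnaryPredicate helper is the GlobalISel counterpart of
ISD::matchUnaryPredicate in SelectionDAG: it resolves Reg (through
copies) to a G_CONSTANT or a constant G_BUILD_VECTOR and applies the
callback to each element. A minimal usage sketch (isPowerOf2OrUndef is
a hypothetical name, not part of this patch):

    // True if every element of Reg is undef or a constant power of two.
    // Undef lanes reach the callback as a null Constant pointer.
    static bool isPowerOf2OrUndef(const MachineRegisterInfo &MRI, Register Reg) {
      return matchUnaryPredicate(
          MRI, Reg,
          [](const Constant *C) {
            const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
            return !CI || CI->getValue().isPowerOf2();
          },
          /*AllowUndefs=*/true);
    }
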
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 97a5c6444cd0..9005f197ea4c 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -3210,6 +3210,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
   case G_SDIVREM:
   case G_UDIVREM:
     return lowerDIVREM(MI);
+  case G_FSHL:
+  case G_FSHR:
+    return lowerFunnelShift(MI);
   }
 }
 
@@ -5207,6 +5210,132 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI) {
   }
 }
 
+// Check that (every element of) Reg is undef or not an exact multiple of BW.
+static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
+                                        Register Reg, unsigned BW) {
+  return matchUnaryPredicate(
+      MRI, Reg,
+      [=](const Constant *C) {
+        // Null constant here means an undef.
+        const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
+        return !CI || CI->getValue().urem(BW) != 0;
+      },
+      /*AllowUndefs*/ true);
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
+  Register Dst = MI.getOperand(0).getReg();
+  Register X = MI.getOperand(1).getReg();
+  Register Y = MI.getOperand(2).getReg();
+  Register Z = MI.getOperand(3).getReg();
+  LLT Ty = MRI.getType(Dst);
+  LLT ShTy = MRI.getType(Z);
+
+  unsigned BW = Ty.getScalarSizeInBits();
+  const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
+  unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
+
+  if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
+    // fshl X, Y, Z -> fshr X, Y, -Z
+    // fshr X, Y, Z -> fshl X, Y, -Z
+    auto Zero = MIRBuilder.buildConstant(ShTy, 0);
+    Z = MIRBuilder.buildSub(ShTy, Zero, Z).getReg(0);
+  } else {
+    // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
+    // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
+    auto One = MIRBuilder.buildConstant(ShTy, 1);
+    if (IsFSHL) {
+      Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
+      X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
+    } else {
+      X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
+      Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
+    }
+
+    Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
+  }
+
+  MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
+  MI.eraseFromParent();
+  return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
+  Register Dst = MI.getOperand(0).getReg();
+  Register X = MI.getOperand(1).getReg();
+  Register Y = MI.getOperand(2).getReg();
+  Register Z = MI.getOperand(3).getReg();
+  LLT Ty = MRI.getType(Dst);
+  LLT ShTy = MRI.getType(Z);
+
+  const unsigned BW = Ty.getScalarSizeInBits();
+  const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
+
+  Register ShX, ShY;
+  Register ShAmt, InvShAmt;
+
+  // FIXME: Emit optimized urem by constant instead of letting it expand later.
+  if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
+    // fshl: X << C | Y >> (BW - C)
+    // fshr: X << (BW - C) | Y >> C
+    // where C = Z % BW is not zero
+    auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
+    ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
+    InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
+    ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
+    ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
+  } else {
+    // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
+    // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
+    auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
+    if (isPowerOf2_32(BW)) {
+      // Z % BW -> Z & (BW - 1)
+      ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
+      // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
+      auto NotZ = MIRBuilder.buildNot(ShTy, Z);
+      InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
+    } else {
+      auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
+      ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
+      InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
+    }
+
+    auto One = MIRBuilder.buildConstant(ShTy, 1);
+    if (IsFSHL) {
+      ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
+      auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
+      ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
+    } else {
+      auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
+      ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
+      ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
+    }
+  }
+
+  MIRBuilder.buildOr(Dst, ShX, ShY);
+  MI.eraseFromParent();
+  return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
+  // These operations approximately do the following (while avoiding undefined
+  // shifts by BW):
+  // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
+  // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+  Register Dst = MI.getOperand(0).getReg();
+  LLT Ty = MRI.getType(Dst);
+  LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
+
+  bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
+  unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
+  if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
+    return lowerFunnelShiftAsShifts(MI);
+  return lowerFunnelShiftWithInverse(MI);
+}
+
 // Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
 // representation.
 LegalizerHelper::LegalizeResult

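Both strategies above can be sanity-checked against plain integer code.
lowerFunnelShiftAsShifts splits one shift in two so that every single
shift amount stays within [0, BW - 1] even when Z % BW == 0, while
lowerFunnelShiftWithInverse flips to the reverse opcode with a negated
(or, in the general case, inverted) amount. Sketches of both for i32
(illustrative helpers, not the emitted MIR):

    #include <cstdint>

    // lowerFunnelShiftAsShifts for a power-of-two width: Z % BW becomes an
    // AND, and (Y >> 1) >> InvShAmt stands in for Y >> (BW - Z % BW), which
    // would be an undefined shift by 32 when Z % BW == 0.
    static uint32_t fshl32_as_shifts(uint32_t X, uint32_t Y, uint32_t Z) {
      unsigned ShAmt = Z & 31;     // Z % BW
      unsigned InvShAmt = ~Z & 31; // (BW - 1) - (Z % BW)
      return (X << ShAmt) | ((Y >> 1) >> InvShAmt);
    }

    // lowerFunnelShiftWithInverse when Z is known nonzero modulo BW: the
    // reverse funnel shift of the negated amount yields the same result.
    static uint32_t fshl32_via_fshr(uint32_t X, uint32_t Y, uint32_t Z) {
      unsigned Amt = (0u - Z) % 32;          // fshr amount; in [1, 31] here
      return (X << (32 - Amt)) | (Y >> Amt); // == fshr(X, Y, -Z)
    }
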
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index dcd3e4646f43..68f51c3702e4 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -926,6 +926,38 @@ Optional<RegOrConstant> llvm::getVectorSplat(const MachineInstr &MI,
   return RegOrConstant(Reg);
 }
 
+bool llvm::matchUnaryPredicate(
+    const MachineRegisterInfo &MRI, Register Reg,
+    std::function<bool(const Constant *ConstVal)> Match, bool AllowUndefs) {
+
+  const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+  if (AllowUndefs && Def->getOpcode() == TargetOpcode::G_IMPLICIT_DEF)
+    return Match(nullptr);
+
+  // TODO: Also handle fconstant
+  if (Def->getOpcode() == TargetOpcode::G_CONSTANT)
+    return Match(Def->getOperand(1).getCImm());
+
+  if (Def->getOpcode() != TargetOpcode::G_BUILD_VECTOR)
+    return false;
+
+  for (unsigned I = 1, E = Def->getNumOperands(); I != E; ++I) {
+    Register SrcElt = Def->getOperand(I).getReg();
+    const MachineInstr *SrcDef = getDefIgnoringCopies(SrcElt, MRI);
+    if (AllowUndefs && SrcDef->getOpcode() == TargetOpcode::G_IMPLICIT_DEF) {
+      if (!Match(nullptr))
+        return false;
+      continue;
+    }
+
+    if (SrcDef->getOpcode() != TargetOpcode::G_CONSTANT ||
+        !Match(SrcDef->getOperand(1).getCImm()))
+      return false;
+  }
+
+  return true;
+}
+
 bool llvm::isConstTrueVal(const TargetLowering &TLI, int64_t Val, bool IsVector,
                           bool IsFP) {
   switch (TLI.getBooleanContents(IsVector, IsFP)) {

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 7c50e17a8b41..d608bb873f07 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1595,11 +1595,26 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     .clampScalar(0, S32, S64)
     .lower();
 
+  // TODO: Only try to form v2s16 with legal packed instructions.
   getActionDefinitionsBuilder(G_FSHR)
     .legalFor({{S32, S32}})
+    .lowerFor({{V2S16, V2S16}})
+    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
     .scalarize(0)
     .lower();
 
+  if (ST.hasVOP3PInsts()) {
+    getActionDefinitionsBuilder(G_FSHL)
+      .lowerFor({{V2S16, V2S16}})
+      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
+      .scalarize(0)
+      .lower();
+  } else {
+    getActionDefinitionsBuilder(G_FSHL)
+      .scalarize(0)
+      .lower();
+  }
+
   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
     .legalFor({S64});
 
@@ -1624,9 +1639,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       G_SADDO, G_SSUBO,
 
        // TODO: Implement
-      G_FMINIMUM, G_FMAXIMUM,
-      G_FSHL
-    }).lower();
+      G_FMINIMUM, G_FMAXIMUM}).lower();
 
   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,

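With these rules, an s32 G_FSHL on AMDGPU takes the inverse path, since
G_FSHR is legal there (it maps to v_alignbit_b32), while 16-bit element
types go through the shift/or expansion because both funnel-shift
opcodes lower. For reference, the G_FSHR form of that expansion at 16
bits (an illustrative helper, not emitted code):

    #include <cstdint>

    // fshr for i16 via the same defined-shift-amount split: (X << 1) <<
    // InvShAmt stands in for X << (16 - Z % 16), which is not directly
    // expressible as one shift when Z % 16 == 0.
    static uint16_t fshr16_as_shifts(uint16_t X, uint16_t Y, uint16_t Z) {
      unsigned ShAmt = Z & 15;     // Z % BW
      unsigned InvShAmt = ~Z & 15; // (BW - 1) - (Z % BW)
      uint32_t ShX = ((uint32_t)X << 1) << InvShAmt;
      uint32_t ShY = (uint32_t)Y >> ShAmt;
      return (uint16_t)(ShX | ShY);
    }
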
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
new file mode 100644
index 000000000000..5e98c4e6abd1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -0,0 +1,7474 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
+
+define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
+; GFX6-LABEL: s_fshl_i7:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT:    s_sub_i32 s3, 0, 7
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX6-NEXT:    s_movk_i32 s3, 0x7f
+; GFX6-NEXT:    s_and_b32 s2, s2, s3
+; GFX6-NEXT:    s_and_b32 s1, s1, s3
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 7
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 7, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 7, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 6, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX6-NEXT:    v_lshl_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_lshr_b32_e32 v1, s1, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_i7:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT:    s_sub_i32 s3, 0, 7
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX8-NEXT:    s_movk_i32 s3, 0x7f
+; GFX8-NEXT:    s_and_b32 s2, s2, s3
+; GFX8-NEXT:    s_and_b32 s1, s1, s3
+; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 7
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 7, v0
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 7, v0
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_sub_u16_e32 v1, 6, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX8-NEXT:    v_lshlrev_b16_e64 v0, v0, s0
+; GFX8-NEXT:    v_lshrrev_b16_e64 v1, v1, s1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_i7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_sub_i32 s3, 0, 7
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX9-NEXT:    s_movk_i32 s3, 0x7f
+; GFX9-NEXT:    s_and_b32 s2, s2, s3
+; GFX9-NEXT:    s_and_b32 s1, s1, s3
+; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 7
+; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v1, 7, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v1, 7, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_sub_u16_e32 v1, 6, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX9-NEXT:    v_lshlrev_b16_e64 v0, v0, s0
+; GFX9-NEXT:    v_lshrrev_b16_e64 v1, v1, s1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_i7:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
+; GFX10-NEXT:    s_sub_i32 s3, 0, 7
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX10-NEXT:    s_movk_i32 s3, 0x7f
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    s_and_b32 s1, s1, s3
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX10-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 7
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 7, v0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 7, v0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u16_e64 v1, 6, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, v0, s0
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, v1, s1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt)
+  ret i7 %result
+}
+
+define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
+; GFX6-LABEL: v_fshl_i7:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX6-NEXT:    s_sub_i32 s4, 0, 7
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7f, v2
+; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, 0x7f
+; GFX6-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_mul_lo_u32 v3, v3, 7
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 7, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 7, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 6, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, v3, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i7:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX8-NEXT:    s_sub_i32 s4, 0, 7
+; GFX8-NEXT:    v_and_b32_e32 v2, 0x7f, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX8-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
+; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7f
+; GFX8-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v1
+; GFX8-NEXT:    v_mul_lo_u32 v3, v3, 7
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 7, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 7, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_sub_u16_e32 v3, 6, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, v3, v4
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX9-NEXT:    s_sub_i32 s4, 0, 7
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x7f, v2
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
+; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7f
+; GFX9-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 1, v1
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 7
+; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v3, 7, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v3, 7, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_sub_u16_e32 v3, 6, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, v3, v4
+; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i7:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
+; GFX10-NEXT:    s_sub_i32 s4, 0, 7
+; GFX10-NEXT:    v_and_b32_e32 v2, 0x7f, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0x7f, v1
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, 1, v1
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX10-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GFX10-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v4
+; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX10-NEXT:    v_mul_lo_u32 v3, v3, 7
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 7, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 7, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7f
+; GFX10-NEXT:    v_sub_nc_u16_e64 v4, 6, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, v2, v3
+; GFX10-NEXT:    v_and_b32_e32 v3, v4, v3
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, v2, v0
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, v3, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt)
+  ret i7 %result
+}
+
+define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
+; GFX6-LABEL: s_fshl_i8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX6-NEXT:    s_and_b32 s3, s2, 7
+; GFX6-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX6-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_i8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_and_b32 s3, s2, 7
+; GFX8-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_and_b32 s3, s2, 7
+; GFX9-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10-NEXT:    s_and_b32 s3, s2, 7
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 %amt)
+  ret i8 %result
+}
+
+define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
+; GFX6-LABEL: v_fshl_i8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v3, 7, v2
+; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v3, 7, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 1
+; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v3, 7, v2
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 1
+; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 7, v3
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, 1, v1
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, v2, v0
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, v3, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 %amt)
+  ret i8 %result
+}
+
+define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
+; GFX6-LABEL: s_fshl_i8_4:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 4
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_i8_4:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX8-NEXT:    s_lshr_b32 s1, s1, 4
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_i8_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 4
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_i8_4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 4
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 4)
+  ret i8 %result
+}
+
+define i8 @v_fshl_i8_4(i8 %lhs, i8 %rhs) {
+; GFX6-LABEL: v_fshl_i8_4:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 4, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i8_4:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, 4
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i8_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 4
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i8_4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, 4, v0
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, 4, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 4)
+  ret i8 %result
+}
+
+define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
+; GFX6-LABEL: s_fshl_i8_5:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 3
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_i8_5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX8-NEXT:    s_lshr_b32 s1, s1, 3
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_i8_5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 3
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_i8_5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 3
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 5)
+  ret i8 %result
+}
+
+define i8 @v_fshl_i8_5(i8 %lhs, i8 %rhs) {
+; GFX6-LABEL: v_fshl_i8_5:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 3, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i8_5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, 3
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 5, v0
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i8_5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, 3
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 5, v0
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i8_5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, 5, v0
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, 3, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 5)
+  ret i8 %result
+}
+
+define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 inreg %amt.arg) {
+; GFX6-LABEL: s_fshl_v2i8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s5, s2, 7
+; GFX6-NEXT:    s_movk_i32 s6, 0xff
+; GFX6-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s5
+; GFX6-NEXT:    s_and_b32 s5, s1, s6
+; GFX6-NEXT:    s_lshr_b32 s4, s2, 8
+; GFX6-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX6-NEXT:    s_lshr_b32 s5, s5, 1
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 9
+; GFX6-NEXT:    s_lshr_b32 s2, s5, s2
+; GFX6-NEXT:    s_or_b32 s0, s0, s2
+; GFX6-NEXT:    s_and_b32 s2, s4, 7
+; GFX6-NEXT:    s_andn2_b32 s4, 7, s4
+; GFX6-NEXT:    s_and_b32 s1, s1, 0x7f
+; GFX6-NEXT:    s_lshl_b32 s2, s3, s2
+; GFX6-NEXT:    s_lshr_b32 s1, s1, s4
+; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_and_b32 s1, s1, s6
+; GFX6-NEXT:    s_and_b32 s0, s0, s6
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_v2i8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s6, s2, 7
+; GFX8-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s6
+; GFX8-NEXT:    s_movk_i32 s6, 0xff
+; GFX8-NEXT:    s_lshr_b32 s4, s1, 8
+; GFX8-NEXT:    s_and_b32 s1, s1, s6
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s5, s2, 8
+; GFX8-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s5, 7
+; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
+; GFX8-NEXT:    s_and_b32 s3, s4, s6
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX8-NEXT:    s_andn2_b32 s2, 7, s5
+; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX8-NEXT:    s_lshr_b32 s2, s3, s2
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_and_b32 s1, s1, s6
+; GFX8-NEXT:    s_bfe_u32 s2, 8, 0x100000
+; GFX8-NEXT:    s_and_b32 s0, s0, s6
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_v2i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s6, s2, 7
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s6
+; GFX9-NEXT:    s_movk_i32 s6, 0xff
+; GFX9-NEXT:    s_lshr_b32 s4, s1, 8
+; GFX9-NEXT:    s_and_b32 s1, s1, s6
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s5, s2, 8
+; GFX9-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s5, 7
+; GFX9-NEXT:    s_lshl_b32 s1, s3, s1
+; GFX9-NEXT:    s_and_b32 s3, s4, s6
+; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX9-NEXT:    s_andn2_b32 s2, 7, s5
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX9-NEXT:    s_lshr_b32 s2, s3, s2
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s1, s1, s6
+; GFX9-NEXT:    s_bfe_u32 s2, 8, 0x100000
+; GFX9-NEXT:    s_and_b32 s0, s0, s6
+; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_v2i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s4, s1, 8
+; GFX10-NEXT:    s_movk_i32 s6, 0xff
+; GFX10-NEXT:    s_lshr_b32 s5, s2, 8
+; GFX10-NEXT:    s_and_b32 s4, s4, s6
+; GFX10-NEXT:    s_and_b32 s1, s1, s6
+; GFX10-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX10-NEXT:    s_and_b32 s7, s2, 7
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s7
+; GFX10-NEXT:    s_and_b32 s7, s5, 7
+; GFX10-NEXT:    s_andn2_b32 s5, 7, s5
+; GFX10-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX10-NEXT:    s_lshl_b32 s3, s3, s7
+; GFX10-NEXT:    s_lshr_b32 s4, s4, s5
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX10-NEXT:    s_or_b32 s2, s3, s4
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    s_and_b32 s1, s2, s6
+; GFX10-NEXT:    s_bfe_u32 s2, 8, 0x100000
+; GFX10-NEXT:    s_and_b32 s0, s0, s6
+; GFX10-NEXT:    s_lshl_b32 s1, s1, s2
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %lhs = bitcast i16 %lhs.arg to <2 x i8>
+  %rhs = bitcast i16 %rhs.arg to <2 x i8>
+  %amt = bitcast i16 %amt.arg to <2 x i8>
+  %result = call <2 x i8> @llvm.fshl.v2i8(<2 x i8> %lhs, <2 x i8> %rhs, <2 x i8> %amt)
+  %cast.result = bitcast <2 x i8> %result to i16
+  ret i16 %cast.result
+}
+
+define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
+; GFX6-LABEL: v_fshl_v2i8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xff, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX6-NEXT:    v_and_b32_e32 v5, 7, v2
+; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 1, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v5, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v2, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 9, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 7, v4
+; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX6-NEXT:    v_and_b32_e32 v4, 7, v4
+; GFX6-NEXT:    v_and_b32_e32 v1, 0x7f, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v2, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v4, v1
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0xff
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, v1, v5
+; GFX6-NEXT:    v_and_b32_e32 v0, v0, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_v2i8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v6, 7, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v6, v0
+; GFX8-NEXT:    v_mov_b32_e32 v6, 1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v5
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 7, v5
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v2, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_v2i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v6, 7, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v5
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 7, v5
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v1, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v2, v3
+; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_v2i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX10-NEXT:    s_movk_i32 s4, 0xff
+; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v3
+; GFX10-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 7, v3
+; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX10-NEXT:    v_lshrrev_b16_e64 v4, 1, v4
+; GFX10-NEXT:    v_and_b32_e32 v7, 7, v7
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, 1, v1
+; GFX10-NEXT:    v_lshlrev_b16_e64 v3, v3, v5
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, v2, v0
+; GFX10-NEXT:    v_lshrrev_b16_e64 v4, v6, v4
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, v7, v1
+; GFX10-NEXT:    v_or_b32_e32 v2, v3, v4
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_sdwa v1, v2, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %lhs = bitcast i16 %lhs.arg to <2 x i8>
+  %rhs = bitcast i16 %rhs.arg to <2 x i8>
+  %amt = bitcast i16 %amt.arg to <2 x i8>
+  %result = call <2 x i8> @llvm.fshl.v2i8(<2 x i8> %lhs, <2 x i8> %rhs, <2 x i8> %amt)
+  %cast.result = bitcast <2 x i8> %result to i16
+  ret i16 %cast.result
+}
+
+define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 inreg %amt.arg) {
+; GFX6-LABEL: s_fshl_v4i8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s9, s2, 7
+; GFX6-NEXT:    s_movk_i32 s10, 0xff
+; GFX6-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX6-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX6-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s9
+; GFX6-NEXT:    s_and_b32 s9, s1, s10
+; GFX6-NEXT:    s_lshr_b32 s6, s2, 8
+; GFX6-NEXT:    s_lshr_b32 s7, s2, 16
+; GFX6-NEXT:    s_lshr_b32 s8, s2, 24
+; GFX6-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX6-NEXT:    s_lshr_b32 s9, s9, 1
+; GFX6-NEXT:    s_lshr_b32 s2, s9, s2
+; GFX6-NEXT:    s_or_b32 s0, s0, s2
+; GFX6-NEXT:    s_and_b32 s2, s6, 7
+; GFX6-NEXT:    s_lshl_b32 s2, s3, s2
+; GFX6-NEXT:    s_lshr_b32 s3, s1, 9
+; GFX6-NEXT:    s_movk_i32 s9, 0x7f
+; GFX6-NEXT:    s_andn2_b32 s6, 7, s6
+; GFX6-NEXT:    s_and_b32 s3, s3, s9
+; GFX6-NEXT:    s_lshr_b32 s3, s3, s6
+; GFX6-NEXT:    s_or_b32 s2, s2, s3
+; GFX6-NEXT:    s_and_b32 s3, s7, 7
+; GFX6-NEXT:    s_lshl_b32 s3, s4, s3
+; GFX6-NEXT:    s_lshr_b32 s4, s1, 17
+; GFX6-NEXT:    s_andn2_b32 s6, 7, s7
+; GFX6-NEXT:    s_and_b32 s4, s4, s9
+; GFX6-NEXT:    s_lshr_b32 s4, s4, s6
+; GFX6-NEXT:    s_and_b32 s2, s2, s10
+; GFX6-NEXT:    s_or_b32 s3, s3, s4
+; GFX6-NEXT:    s_and_b32 s4, s8, 7
+; GFX6-NEXT:    s_andn2_b32 s6, 7, s8
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 25
+; GFX6-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX6-NEXT:    s_lshr_b32 s1, s1, s6
+; GFX6-NEXT:    s_and_b32 s0, s0, s10
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX6-NEXT:    s_or_b32 s1, s4, s1
+; GFX6-NEXT:    s_or_b32 s0, s0, s2
+; GFX6-NEXT:    s_and_b32 s2, s3, s10
+; GFX6-NEXT:    s_and_b32 s1, s1, s10
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_or_b32 s0, s0, s2
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_v4i8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_movk_i32 s13, 0xff
+; GFX8-NEXT:    s_lshr_b32 s6, s1, 8
+; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s8, s1, 24
+; GFX8-NEXT:    s_and_b32 s1, s1, s13
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s9, s2, 8
+; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX8-NEXT:    s_lshr_b32 s11, s2, 24
+; GFX8-NEXT:    s_and_b32 s12, s2, 7
+; GFX8-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX8-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s12
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s9, 7
+; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
+; GFX8-NEXT:    s_and_b32 s3, s6, s13
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX8-NEXT:    s_andn2_b32 s2, 7, s9
+; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX8-NEXT:    s_lshr_b32 s2, s3, s2
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_and_b32 s2, s10, 7
+; GFX8-NEXT:    s_lshl_b32 s2, s4, s2
+; GFX8-NEXT:    s_and_b32 s4, s7, s13
+; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT:    s_andn2_b32 s3, 7, s10
+; GFX8-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX8-NEXT:    s_lshr_b32 s3, s4, s3
+; GFX8-NEXT:    s_and_b32 s1, s1, s13
+; GFX8-NEXT:    s_or_b32 s2, s2, s3
+; GFX8-NEXT:    s_and_b32 s3, s11, 7
+; GFX8-NEXT:    s_lshl_b32 s3, s5, s3
+; GFX8-NEXT:    s_andn2_b32 s4, 7, s11
+; GFX8-NEXT:    s_lshr_b32 s5, s8, 1
+; GFX8-NEXT:    s_and_b32 s0, s0, s13
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX8-NEXT:    s_lshr_b32 s4, s5, s4
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s2, s13
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s3, s3, s4
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s3, s13
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_v4i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_movk_i32 s13, 0xff
+; GFX9-NEXT:    s_lshr_b32 s6, s1, 8
+; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s8, s1, 24
+; GFX9-NEXT:    s_and_b32 s1, s1, s13
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s9, s2, 8
+; GFX9-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX9-NEXT:    s_lshr_b32 s11, s2, 24
+; GFX9-NEXT:    s_and_b32 s12, s2, 7
+; GFX9-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s12
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s9, 7
+; GFX9-NEXT:    s_lshl_b32 s1, s3, s1
+; GFX9-NEXT:    s_and_b32 s3, s6, s13
+; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX9-NEXT:    s_andn2_b32 s2, 7, s9
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX9-NEXT:    s_lshr_b32 s2, s3, s2
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s10, 7
+; GFX9-NEXT:    s_lshl_b32 s2, s4, s2
+; GFX9-NEXT:    s_and_b32 s4, s7, s13
+; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX9-NEXT:    s_andn2_b32 s3, 7, s10
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX9-NEXT:    s_lshr_b32 s3, s4, s3
+; GFX9-NEXT:    s_and_b32 s1, s1, s13
+; GFX9-NEXT:    s_or_b32 s2, s2, s3
+; GFX9-NEXT:    s_and_b32 s3, s11, 7
+; GFX9-NEXT:    s_lshl_b32 s3, s5, s3
+; GFX9-NEXT:    s_andn2_b32 s4, 7, s11
+; GFX9-NEXT:    s_lshr_b32 s5, s8, 1
+; GFX9-NEXT:    s_and_b32 s0, s0, s13
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX9-NEXT:    s_lshr_b32 s4, s5, s4
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s2, s13
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX9-NEXT:    s_or_b32 s3, s3, s4
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s3, s13
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_v4i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_movk_i32 s11, 0xff
+; GFX10-NEXT:    s_lshr_b32 s6, s1, 8
+; GFX10-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX10-NEXT:    s_lshr_b32 s8, s1, 24
+; GFX10-NEXT:    s_and_b32 s1, s1, s11
+; GFX10-NEXT:    s_lshr_b32 s9, s2, 8
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX10-NEXT:    s_lshr_b32 s12, s2, 24
+; GFX10-NEXT:    s_and_b32 s13, s2, 7
+; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX10-NEXT:    s_and_b32 s2, s6, s11
+; GFX10-NEXT:    s_and_b32 s6, s9, 7
+; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX10-NEXT:    s_andn2_b32 s9, 7, s9
+; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX10-NEXT:    s_lshl_b32 s3, s3, s6
+; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s13
+; GFX10-NEXT:    s_lshr_b32 s2, s2, s9
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    s_or_b32 s1, s3, s2
+; GFX10-NEXT:    s_and_b32 s2, s7, s11
+; GFX10-NEXT:    s_and_b32 s3, s10, 7
+; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX10-NEXT:    s_andn2_b32 s6, 7, s10
+; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
+; GFX10-NEXT:    s_lshr_b32 s2, s2, s6
+; GFX10-NEXT:    s_and_b32 s4, s12, 7
+; GFX10-NEXT:    s_andn2_b32 s6, 7, s12
+; GFX10-NEXT:    s_lshr_b32 s7, s8, 1
+; GFX10-NEXT:    s_or_b32 s2, s3, s2
+; GFX10-NEXT:    s_and_b32 s1, s1, s11
+; GFX10-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX10-NEXT:    s_lshr_b32 s5, s7, s6
+; GFX10-NEXT:    s_and_b32 s0, s0, s11
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX10-NEXT:    s_or_b32 s3, s4, s5
+; GFX10-NEXT:    s_and_b32 s2, s2, s11
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    s_lshl_b32 s1, s2, 16
+; GFX10-NEXT:    s_and_b32 s2, s3, s11
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    s_lshl_b32 s1, s2, 24
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %lhs = bitcast i32 %lhs.arg to <4 x i8>
+  %rhs = bitcast i32 %rhs.arg to <4 x i8>
+  %amt = bitcast i32 %amt.arg to <4 x i8>
+  %result = call <4 x i8> @llvm.fshl.v4i8(<4 x i8> %lhs, <4 x i8> %rhs, <4 x i8> %amt)
+  %cast.result = bitcast <4 x i8> %result to i32
+  ret i32 %cast.result
+}
+
+define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
+; GFX6-LABEL: v_fshl_v4i8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v10, 0xff, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
+; GFX6-NEXT:    v_and_b32_e32 v9, 7, v2
+; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 1, v10
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v9, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v2, v10
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 7, v6
+; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v2, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 9, v1
+; GFX6-NEXT:    s_movk_i32 s4, 0x7f
+; GFX6-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v6, v3
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT:    v_and_b32_e32 v3, 7, v7
+; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v3, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 17, v1
+; GFX6-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX6-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX6-NEXT:    v_mov_b32_e32 v9, 0xff
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v6, v4
+; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v8
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v9
+; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    v_and_b32_e32 v4, 7, v8
+; GFX6-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 25, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v6, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, v0, v9
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, v3, v9
+; GFX6-NEXT:    v_and_b32_e32 v1, v1, v9
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_v4i8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v8, 7, v2
+; GFX8-NEXT:    v_mov_b32_e32 v10, 1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v8, v8, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v2, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v8, v2
+; GFX8-NEXT:    v_and_b32_e32 v8, 7, v5
+; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_and_b32_e32 v5, 7, v5
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v5, v4
+; GFX8-NEXT:    v_lshlrev_b16_e32 v3, v8, v3
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0xff
+; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v6
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_and_b32_e32 v4, 7, v6
+; GFX8-NEXT:    v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v5, 7, v5
+; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 1, v6
+; GFX8-NEXT:    v_lshrrev_b16_e32 v5, v5, v6
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_xor_b32_e32 v6, -1, v7
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, 7, v7
+; GFX8-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v6, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v2, s4, v4
+; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_v4i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v8, 7, v2
+; GFX9-NEXT:    v_mov_b32_e32 v10, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v8, v8, v0
+; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v2, v11
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX9-NEXT:    v_or_b32_e32 v2, v8, v2
+; GFX9-NEXT:    v_and_b32_e32 v8, 7, v5
+; GFX9-NEXT:    v_xor_b32_e32 v5, -1, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX9-NEXT:    v_and_b32_e32 v5, 7, v5
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b16_e32 v4, v5, v4
+; GFX9-NEXT:    v_lshlrev_b16_e32 v3, v8, v3
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0xff
+; GFX9-NEXT:    v_xor_b32_e32 v5, -1, v6
+; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 7, v6
+; GFX9-NEXT:    v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v5, 7, v5
+; GFX9-NEXT:    v_lshrrev_b16_e32 v6, 1, v6
+; GFX9-NEXT:    v_lshrrev_b16_e32 v5, v5, v6
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v7
+; GFX9-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX9-NEXT:    v_and_b32_e32 v5, 7, v7
+; GFX9-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v6, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 8
+; GFX9-NEXT:    s_movk_i32 s4, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v2, s4, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, s4, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
+; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_v4i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX10-NEXT:    v_and_b32_e32 v11, 7, v2
+; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, v11, v0
+; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v8
+; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX10-NEXT:    v_mov_b32_e32 v15, 0xff
+; GFX10-NEXT:    s_movk_i32 s4, 0xff
+; GFX10-NEXT:    v_xor_b32_e32 v13, -1, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX10-NEXT:    v_and_b32_e32 v12, s4, v1
+; GFX10-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b16_e64 v3, v8, v3
+; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v9
+; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT:    v_and_b32_e32 v9, 7, v9
+; GFX10-NEXT:    v_and_b32_e32 v11, 7, v11
+; GFX10-NEXT:    v_lshrrev_b16_e64 v6, 1, v6
+; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, 1, v1
+; GFX10-NEXT:    v_and_b32_e32 v13, 7, v13
+; GFX10-NEXT:    v_lshrrev_b16_e64 v7, 1, v7
+; GFX10-NEXT:    v_lshrrev_b16_e64 v6, v11, v6
+; GFX10-NEXT:    v_lshlrev_b16_e64 v2, v2, v5
+; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
+; GFX10-NEXT:    v_lshrrev_b16_e64 v12, 1, v12
+; GFX10-NEXT:    v_lshrrev_b16_e64 v5, v13, v7
+; GFX10-NEXT:    v_lshlrev_b16_e64 v4, v9, v4
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, v8, v1
+; GFX10-NEXT:    v_or_b32_e32 v3, v3, v6
+; GFX10-NEXT:    v_lshrrev_b16_e64 v7, v10, v12
+; GFX10-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX10-NEXT:    v_mov_b32_e32 v6, 8
+; GFX10-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %lhs = bitcast i32 %lhs.arg to <4 x i8>
+  %rhs = bitcast i32 %rhs.arg to <4 x i8>
+  %amt = bitcast i32 %amt.arg to <4 x i8>
+  %result = call <4 x i8> @llvm.fshl.v4i8(<4 x i8> %lhs, <4 x i8> %rhs, <4 x i8> %amt)
+  %cast.result = bitcast <4 x i8> %result to i32
+  ret i32 %cast.result
+}
+
+define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) {
+; GFX6-LABEL: s_fshl_i24:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT:    s_sub_i32 s3, 0, 24
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX6-NEXT:    s_mov_b32 s3, 0xffffff
+; GFX6-NEXT:    s_and_b32 s2, s2, s3
+; GFX6-NEXT:    s_and_b32 s1, s1, s3
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 24, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 24, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 23, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX6-NEXT:    v_lshl_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_lshr_b32_e32 v1, s1, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_i24:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT:    s_sub_i32 s3, 0, 24
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX8-NEXT:    s_mov_b32 s3, 0xffffff
+; GFX8-NEXT:    s_and_b32 s2, s2, s3
+; GFX8-NEXT:    s_and_b32 s1, s1, s3
+; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 24, v0
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 24, v0
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 23, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s0
+; GFX8-NEXT:    v_lshrrev_b32_e64 v1, v1, s1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_i24:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_sub_i32 s3, 0, 24
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX9-NEXT:    s_mov_b32 s3, 0xffffff
+; GFX9-NEXT:    s_and_b32 s2, s2, s3
+; GFX9-NEXT:    s_and_b32 s1, s1, s3
+; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v1, 24, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v1, 24, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v1, 23, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX9-NEXT:    v_lshrrev_b32_e64 v1, v1, s1
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, v0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_i24:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX10-NEXT:    s_sub_i32 s3, 0, 24
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX10-NEXT:    s_mov_b32 s3, 0xffffff
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    s_and_b32 s1, s1, s3
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX10-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 24, v0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 24, v0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 23, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX10-NEXT:    v_lshrrev_b32_e64 v1, v1, s1
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, v0, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i24 @llvm.fshl.i24(i24 %lhs, i24 %rhs, i24 %amt)
+  ret i24 %result
+}
+
+define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
+; GFX6-LABEL: v_fshl_i24:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX6-NEXT:    s_sub_i32 s4, 0, 24
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffffff
+; GFX6-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_mul_lo_u32 v3, v3, 24
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 23, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, v3, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i24:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX8-NEXT:    s_sub_i32 s4, 0, 24
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX8-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
+; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffffff
+; GFX8-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_mul_lo_u32 v3, v3, 24
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 23, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, v3, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i24:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX9-NEXT:    s_sub_i32 s4, 0, 24
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
+; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffff
+; GFX9-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 24
+; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v3, 23, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i24:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
+; GFX10-NEXT:    s_sub_i32 s4, 0, 24
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX10-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GFX10-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffffff
+; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_mul_lo_u32 v3, v3, 24
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX10-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
+; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v2, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i24 @llvm.fshl.i24(i24 %lhs, i24 %rhs, i24 %amt)
+  ret i24 %result
+}
+
+define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 inreg %amt.arg) {
+; GFX6-LABEL: s_fshl_v2i24:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX6-NEXT:    s_movk_i32 s10, 0xff
+; GFX6-NEXT:    s_and_b32 s6, s6, s10
+; GFX6-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX6-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX6-NEXT:    s_and_b32 s0, s0, s10
+; GFX6-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX6-NEXT:    s_or_b32 s0, s0, s6
+; GFX6-NEXT:    s_and_b32 s6, s7, s10
+; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s9, s1, 8
+; GFX6-NEXT:    s_and_b32 s1, s1, s10
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX6-NEXT:    s_or_b32 s0, s0, s6
+; GFX6-NEXT:    s_and_b32 s6, s9, s10
+; GFX6-NEXT:    s_or_b32 s1, s8, s1
+; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX6-NEXT:    s_or_b32 s1, s1, s6
+; GFX6-NEXT:    s_lshr_b32 s6, s2, 8
+; GFX6-NEXT:    s_and_b32 s6, s6, s10
+; GFX6-NEXT:    s_lshr_b32 s7, s2, 16
+; GFX6-NEXT:    s_lshr_b32 s8, s2, 24
+; GFX6-NEXT:    s_and_b32 s2, s2, s10
+; GFX6-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX6-NEXT:    s_or_b32 s2, s2, s6
+; GFX6-NEXT:    s_and_b32 s6, s7, s10
+; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s9, s3, 8
+; GFX6-NEXT:    s_and_b32 s3, s3, s10
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX6-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX6-NEXT:    s_or_b32 s2, s2, s6
+; GFX6-NEXT:    s_and_b32 s6, s9, s10
+; GFX6-NEXT:    s_or_b32 s3, s8, s3
+; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX6-NEXT:    s_or_b32 s3, s3, s6
+; GFX6-NEXT:    s_lshr_b32 s6, s4, 8
+; GFX6-NEXT:    s_and_b32 s6, s6, s10
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX6-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX6-NEXT:    s_lshr_b32 s8, s4, 24
+; GFX6-NEXT:    s_and_b32 s4, s4, s10
+; GFX6-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX6-NEXT:    s_or_b32 s4, s4, s6
+; GFX6-NEXT:    s_and_b32 s6, s7, s10
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX6-NEXT:    s_or_b32 s4, s4, s6
+; GFX6-NEXT:    s_sub_i32 s6, 0, 24
+; GFX6-NEXT:    v_mul_lo_u32 v1, s6, v0
+; GFX6-NEXT:    s_lshr_b32 s9, s5, 8
+; GFX6-NEXT:    s_and_b32 s5, s5, s10
+; GFX6-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_and_b32 s7, s9, s10
+; GFX6-NEXT:    s_or_b32 s5, s8, s5
+; GFX6-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v1, 24
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX6-NEXT:    s_or_b32 s5, s5, s7
+; GFX6-NEXT:    s_mov_b32 s7, 0xffffff
+; GFX6-NEXT:    v_mul_lo_u32 v3, s6, v1
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 23, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, s7, v0
+; GFX6-NEXT:    v_lshl_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX6-NEXT:    s_lshr_b32 s0, s2, 1
+; GFX6-NEXT:    v_and_b32_e32 v2, s7, v2
+; GFX6-NEXT:    v_lshr_b32_e32 v2, s0, v2
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v1
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v1
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffffff
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 23, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX6-NEXT:    s_lshr_b32 s0, s3, 1
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v1, s1, v1
+; GFX6-NEXT:    v_lshr_b32_e32 v2, s0, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, s10, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, s10, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, s10, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, s10, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, s10, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, s10, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_v2i24:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX8-NEXT:    s_movk_i32 s10, 0xff
+; GFX8-NEXT:    s_and_b32 s6, s6, s10
+; GFX8-NEXT:    s_bfe_u32 s11, 8, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX8-NEXT:    s_and_b32 s0, s0, s10
+; GFX8-NEXT:    s_lshl_b32 s6, s6, s11
+; GFX8-NEXT:    s_or_b32 s0, s0, s6
+; GFX8-NEXT:    s_and_b32 s6, s7, s10
+; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 8
+; GFX8-NEXT:    s_and_b32 s1, s1, s10
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s11
+; GFX8-NEXT:    s_or_b32 s0, s0, s6
+; GFX8-NEXT:    s_and_b32 s6, s9, s10
+; GFX8-NEXT:    s_or_b32 s1, s8, s1
+; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX8-NEXT:    s_or_b32 s1, s1, s6
+; GFX8-NEXT:    s_lshr_b32 s6, s2, 8
+; GFX8-NEXT:    s_and_b32 s6, s6, s10
+; GFX8-NEXT:    s_lshr_b32 s7, s2, 16
+; GFX8-NEXT:    s_lshr_b32 s8, s2, 24
+; GFX8-NEXT:    s_and_b32 s2, s2, s10
+; GFX8-NEXT:    s_lshl_b32 s6, s6, s11
+; GFX8-NEXT:    s_or_b32 s2, s2, s6
+; GFX8-NEXT:    s_and_b32 s6, s7, s10
+; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s9, s3, 8
+; GFX8-NEXT:    s_and_b32 s3, s3, s10
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX8-NEXT:    s_lshl_b32 s3, s3, s11
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX8-NEXT:    s_or_b32 s2, s2, s6
+; GFX8-NEXT:    s_and_b32 s6, s9, s10
+; GFX8-NEXT:    s_or_b32 s3, s8, s3
+; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX8-NEXT:    s_or_b32 s3, s3, s6
+; GFX8-NEXT:    s_lshr_b32 s6, s4, 8
+; GFX8-NEXT:    s_and_b32 s6, s6, s10
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX8-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX8-NEXT:    s_lshr_b32 s8, s4, 24
+; GFX8-NEXT:    s_and_b32 s4, s4, s10
+; GFX8-NEXT:    s_lshl_b32 s6, s6, s11
+; GFX8-NEXT:    s_or_b32 s4, s4, s6
+; GFX8-NEXT:    s_and_b32 s6, s7, s10
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX8-NEXT:    s_or_b32 s4, s4, s6
+; GFX8-NEXT:    s_sub_i32 s6, 0, 24
+; GFX8-NEXT:    v_mul_lo_u32 v1, s6, v0
+; GFX8-NEXT:    s_lshr_b32 s9, s5, 8
+; GFX8-NEXT:    s_and_b32 s5, s5, s10
+; GFX8-NEXT:    s_lshl_b32 s5, s5, s11
+; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX8-NEXT:    s_and_b32 s7, s9, s10
+; GFX8-NEXT:    s_or_b32 s5, s8, s5
+; GFX8-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v1, 24
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX8-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX8-NEXT:    s_bfe_u32 s5, s5, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX8-NEXT:    s_or_b32 s5, s5, s7
+; GFX8-NEXT:    s_mov_b32 s7, 0xffffff
+; GFX8-NEXT:    v_mul_lo_u32 v3, s6, v1
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v0
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v0
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 23, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, s7, v0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s0
+; GFX8-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX8-NEXT:    s_lshr_b32 s0, s2, 1
+; GFX8-NEXT:    v_and_b32_e32 v2, s7, v2
+; GFX8-NEXT:    v_lshrrev_b32_e64 v2, v2, s0
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v1
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v1
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffffff
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 23, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX8-NEXT:    s_lshr_b32 s0, s3, 1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s1
+; GFX8-NEXT:    v_lshrrev_b32_e64 v2, v2, s0
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s10
+; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, s10, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_v2i24:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s7, s0, 8
+; GFX9-NEXT:    s_movk_i32 s11, 0xff
+; GFX9-NEXT:    s_and_b32 s7, s7, s11
+; GFX9-NEXT:    s_bfe_u32 s12, 8, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s9, s0, 24
+; GFX9-NEXT:    s_and_b32 s0, s0, s11
+; GFX9-NEXT:    s_lshl_b32 s7, s7, s12
+; GFX9-NEXT:    s_or_b32 s0, s0, s7
+; GFX9-NEXT:    s_and_b32 s7, s8, s11
+; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s10, s1, 8
+; GFX9-NEXT:    s_and_b32 s1, s1, s11
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX9-NEXT:    s_lshl_b32 s1, s1, s12
+; GFX9-NEXT:    s_or_b32 s0, s0, s7
+; GFX9-NEXT:    s_and_b32 s7, s10, s11
+; GFX9-NEXT:    s_or_b32 s1, s9, s1
+; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX9-NEXT:    s_or_b32 s1, s1, s7
+; GFX9-NEXT:    s_lshr_b32 s7, s2, 8
+; GFX9-NEXT:    s_and_b32 s7, s7, s11
+; GFX9-NEXT:    s_lshr_b32 s8, s2, 16
+; GFX9-NEXT:    s_lshr_b32 s9, s2, 24
+; GFX9-NEXT:    s_and_b32 s2, s2, s11
+; GFX9-NEXT:    s_lshl_b32 s7, s7, s12
+; GFX9-NEXT:    s_or_b32 s2, s2, s7
+; GFX9-NEXT:    s_and_b32 s7, s8, s11
+; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s10, s3, 8
+; GFX9-NEXT:    s_and_b32 s3, s3, s11
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX9-NEXT:    s_or_b32 s2, s2, s7
+; GFX9-NEXT:    s_and_b32 s7, s10, s11
+; GFX9-NEXT:    s_lshl_b32 s3, s3, s12
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX9-NEXT:    s_or_b32 s3, s9, s3
+; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX9-NEXT:    s_or_b32 s3, s3, s7
+; GFX9-NEXT:    s_lshr_b32 s7, s4, 8
+; GFX9-NEXT:    s_and_b32 s7, s7, s11
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    s_lshr_b32 s8, s4, 16
+; GFX9-NEXT:    s_lshr_b32 s9, s4, 24
+; GFX9-NEXT:    s_and_b32 s4, s4, s11
+; GFX9-NEXT:    s_lshl_b32 s7, s7, s12
+; GFX9-NEXT:    s_or_b32 s4, s4, s7
+; GFX9-NEXT:    s_and_b32 s7, s8, s11
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX9-NEXT:    s_or_b32 s4, s4, s7
+; GFX9-NEXT:    s_sub_i32 s7, 0, 24
+; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v0
+; GFX9-NEXT:    s_lshr_b32 s10, s5, 8
+; GFX9-NEXT:    s_and_b32 s5, s5, s11
+; GFX9-NEXT:    s_lshl_b32 s5, s5, s12
+; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX9-NEXT:    s_and_b32 s8, s10, s11
+; GFX9-NEXT:    s_or_b32 s5, s9, s5
+; GFX9-NEXT:    s_bfe_u32 s8, s8, 0x100000
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, 24
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX9-NEXT:    s_bfe_u32 s5, s5, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX9-NEXT:    s_or_b32 s5, s5, s8
+; GFX9-NEXT:    s_mov_b32 s8, 0xffffff
+; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v1
+; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v2, 23, v0
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX9-NEXT:    v_and_b32_e32 v2, s8, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, s8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e64 v2, v2, s2
+; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, v0, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffffff
+; GFX9-NEXT:    v_sub_u32_e32 v2, 23, v1
+; GFX9-NEXT:    s_lshr_b32 s0, s3, 1
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX9-NEXT:    v_lshrrev_b32_e64 v2, v2, s0
+; GFX9-NEXT:    v_lshl_or_b32 v1, s1, v1, v2
+; GFX9-NEXT:    s_mov_b32 s6, 8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_e32 v4, s11, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX9-NEXT:    v_and_or_b32 v2, v0, s11, v2
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX9-NEXT:    v_or3_b32 v0, v2, v0, v4
+; GFX9-NEXT:    v_and_or_b32 v1, v3, s11, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_v2i24:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, 24
+; GFX10-NEXT:    s_movk_i32 s8, 0xff
+; GFX10-NEXT:    s_lshr_b32 s11, s1, 8
+; GFX10-NEXT:    s_bfe_u32 s10, 8, 0x100000
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX10-NEXT:    s_and_b32 s1, s1, s8
+; GFX10-NEXT:    s_lshr_b32 s9, s0, 24
+; GFX10-NEXT:    s_lshl_b32 s1, s1, s10
+; GFX10-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX10-NEXT:    s_or_b32 s1, s9, s1
+; GFX10-NEXT:    s_sub_i32 s9, 0, 24
+; GFX10-NEXT:    s_and_b32 s6, s6, s8
+; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX10-NEXT:    s_and_b32 s0, s0, s8
+; GFX10-NEXT:    s_lshl_b32 s6, s6, s10
+; GFX10-NEXT:    s_lshr_b32 s12, s4, 24
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT:    s_or_b32 s0, s0, s6
+; GFX10-NEXT:    s_and_b32 s6, s7, s8
+; GFX10-NEXT:    s_lshr_b32 s7, s4, 8
+; GFX10-NEXT:    v_mul_lo_u32 v2, s9, v0
+; GFX10-NEXT:    v_mul_lo_u32 v3, s9, v1
+; GFX10-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX10-NEXT:    s_and_b32 s7, s7, s8
+; GFX10-NEXT:    s_or_b32 s0, s0, s6
+; GFX10-NEXT:    s_and_b32 s6, s11, s8
+; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX10-NEXT:    s_lshr_b32 s11, s4, 16
+; GFX10-NEXT:    s_and_b32 s4, s4, s8
+; GFX10-NEXT:    s_lshl_b32 s7, s7, s10
+; GFX10-NEXT:    s_and_b32 s9, s11, s8
+; GFX10-NEXT:    s_or_b32 s4, s4, s7
+; GFX10-NEXT:    s_bfe_u32 s7, s9, 0x100000
+; GFX10-NEXT:    s_lshr_b32 s13, s5, 8
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
+; GFX10-NEXT:    v_mul_hi_u32 v2, v1, v3
+; GFX10-NEXT:    s_and_b32 s5, s5, s8
+; GFX10-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX10-NEXT:    s_lshl_b32 s5, s5, s10
+; GFX10-NEXT:    s_or_b32 s4, s4, s7
+; GFX10-NEXT:    s_and_b32 s7, s13, s8
+; GFX10-NEXT:    s_or_b32 s5, s12, s5
+; GFX10-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX10-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
+; GFX10-NEXT:    s_bfe_u32 s5, s5, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX10-NEXT:    s_lshr_b32 s9, s2, 16
+; GFX10-NEXT:    s_or_b32 s5, s5, s7
+; GFX10-NEXT:    s_lshr_b32 s7, s2, 8
+; GFX10-NEXT:    v_mul_hi_u32 v1, s5, v1
+; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX10-NEXT:    s_lshr_b32 s11, s2, 24
+; GFX10-NEXT:    s_and_b32 s2, s2, s8
+; GFX10-NEXT:    s_lshr_b32 s12, s3, 8
+; GFX10-NEXT:    s_and_b32 s3, s3, s8
+; GFX10-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s3, s3, s10
+; GFX10-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT:    s_and_b32 s4, s7, s8
+; GFX10-NEXT:    s_and_b32 s7, s9, s8
+; GFX10-NEXT:    s_lshl_b32 s4, s4, s10
+; GFX10-NEXT:    s_or_b32 s3, s11, s3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s5, v1
+; GFX10-NEXT:    s_or_b32 s2, s2, s4
+; GFX10-NEXT:    s_bfe_u32 s4, s7, 0x100000
+; GFX10-NEXT:    s_mov_b32 s5, 0xffffff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
+; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
+; GFX10-NEXT:    s_or_b32 s2, s2, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT:    s_and_b32 s4, s12, s8
+; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX10-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, s5, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffffff
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 23, v1
+; GFX10-NEXT:    v_lshrrev_b32_e64 v2, v2, s2
+; GFX10-NEXT:    s_bfe_u32 s2, s3, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s3, s4, 16
+; GFX10-NEXT:    v_and_b32_e32 v4, v4, v3
+; GFX10-NEXT:    s_or_b32 s2, s2, s3
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, v0, v2
+; GFX10-NEXT:    s_lshr_b32 s0, s2, 1
+; GFX10-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX10-NEXT:    v_lshrrev_b32_e64 v2, v4, s0
+; GFX10-NEXT:    s_or_b32 s0, s1, s6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, v1, v2
+; GFX10-NEXT:    s_mov_b32 s0, 8
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_and_b32_e32 v3, s8, v1
+; GFX10-NEXT:    v_and_b32_sdwa v4, v1, s8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-NEXT:    v_and_or_b32 v2, v0, s8, v2
+; GFX10-NEXT:    v_and_b32_sdwa v0, v0, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX10-NEXT:    v_or3_b32 v0, v2, v0, v3
+; GFX10-NEXT:    v_and_or_b32 v1, v1, s8, v4
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-NEXT:    ; return to shader part epilog
+  %lhs = bitcast i48 %lhs.arg to <2 x i24>
+  %rhs = bitcast i48 %rhs.arg to <2 x i24>
+  %amt = bitcast i48 %amt.arg to <2 x i24>
+  %result = call <2 x i24> @llvm.fshl.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
+  %cast.result = bitcast <2 x i24> %result to i48
+  ret i48 %cast.result
+}
+
+define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
+; GFX6-LABEL: v_fshl_v2i24:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GFX6-NEXT:    s_sub_i32 s4, 0, 24
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v8, 24
+; GFX6-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX6-NEXT:    v_mul_lo_u32 v7, s4, v6
+; GFX6-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GFX6-NEXT:    v_mul_hi_u32 v6, v4, v6
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v8
+; GFX6-NEXT:    v_mov_b32_e32 v8, 0xffffff
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v8
+; GFX6-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX6-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
+; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 24, v4
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 24, v4
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, s4, v7
+; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 23, v4
+; GFX6-NEXT:    v_and_b32_e32 v4, v4, v8
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
+; GFX6-NEXT:    v_mul_hi_u32 v6, v7, v6
+; GFX6-NEXT:    v_and_b32_e32 v4, v5, v8
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v6
+; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
+; GFX6-NEXT:    v_and_b32_e32 v6, v9, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_mul_lo_u32 v5, v5, 24
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v4, v5
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 24, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 24, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 23, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v8
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_and_b32_e32 v2, v3, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, v4, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_v2i24:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GFX8-NEXT:    s_sub_i32 s4, 0, 24
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v8, 24
+; GFX8-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX8-NEXT:    v_mul_lo_u32 v7, s4, v6
+; GFX8-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
+; GFX8-NEXT:    v_mul_hi_u32 v6, v4, v6
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v7, v8
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0xffffff
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v8
+; GFX8-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX8-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v6
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 24, v4
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 24, v4
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v6, s4, v7
+; GFX8-NEXT:    v_sub_u32_e32 v9, vcc, 23, v4
+; GFX8-NEXT:    v_and_b32_e32 v4, v4, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
+; GFX8-NEXT:    v_mul_hi_u32 v6, v7, v6
+; GFX8-NEXT:    v_and_b32_e32 v4, v5, v8
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v7, v6
+; GFX8-NEXT:    v_mul_hi_u32 v5, v4, v5
+; GFX8-NEXT:    v_and_b32_e32 v6, v9, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_mul_lo_u32 v5, v5, 24
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v4, v5
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 24, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 24, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 23, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_and_b32_e32 v2, v3, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX8-NEXT:    v_and_b32_e32 v3, v4, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_v2i24:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GFX9-NEXT:    s_sub_i32 s4, 0, 24
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v8, 24
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v8
+; GFX9-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX9-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0xffffff
+; GFX9-NEXT:    v_mul_lo_u32 v7, s4, v6
+; GFX9-NEXT:    v_and_b32_e32 v5, v5, v9
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX9-NEXT:    v_and_b32_e32 v3, v3, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v7
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v8
+; GFX9-NEXT:    v_mul_hi_u32 v6, v4, v6
+; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v7
+; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX9-NEXT:    v_mul_hi_u32 v8, v7, v8
+; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v6
+; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
+; GFX9-NEXT:    v_add_u32_e32 v7, v7, v8
+; GFX9-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v7, 24
+; GFX9-NEXT:    v_sub_u32_e32 v6, 23, v4
+; GFX9-NEXT:    v_and_b32_e32 v6, v6, v9
+; GFX9-NEXT:    v_and_b32_e32 v4, v4, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v4, v2
+; GFX9-NEXT:    v_sub_u32_e32 v2, v5, v7
+; GFX9-NEXT:    v_subrev_u32_e32 v4, 24, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v4, 24, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v4, 23, v2
+; GFX9-NEXT:    v_and_b32_e32 v4, v4, v9
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, v4, v3
+; GFX9-NEXT:    v_lshl_or_b32 v1, v1, v2, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_v2i24:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v7, 24
+; GFX10-NEXT:    s_sub_i32 s4, 0, 24
+; GFX10-NEXT:    v_mov_b32_e32 v12, 0xffffff
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v7, v7
+; GFX10-NEXT:    v_and_b32_e32 v5, v5, v12
+; GFX10-NEXT:    v_and_b32_e32 v2, v2, v12
+; GFX10-NEXT:    v_and_b32_e32 v3, v3, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX10-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX10-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GFX10-NEXT:    v_mul_lo_u32 v8, s4, v6
+; GFX10-NEXT:    v_mul_lo_u32 v9, s4, v7
+; GFX10-NEXT:    v_mul_hi_u32 v8, v6, v8
+; GFX10-NEXT:    v_mul_hi_u32 v9, v7, v9
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v8
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v7, v9
+; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v6
+; GFX10-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GFX10-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX10-NEXT:    v_mul_lo_u32 v7, v7, 24
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v5, v5, v7
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v4
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v4
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v4, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 23, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v11, v6, v12
+; GFX10-NEXT:    v_sub_nc_u32_e32 v7, 23, v5
+; GFX10-NEXT:    v_and_b32_e32 v10, v5, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v11, v2
+; GFX10-NEXT:    v_and_b32_e32 v6, v7, v12
+; GFX10-NEXT:    v_and_b32_e32 v7, v15, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v6, v3
+; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v7, v2
+; GFX10-NEXT:    v_lshl_or_b32 v1, v1, v10, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <2 x i24> @llvm.fshl.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
+  ret <2 x i24> %result
+}
+
+define amdgpu_ps i32 @s_fshl_i32(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) {
+; GFX6-LABEL: s_fshl_i32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_mov_b32_e32 v0, s1
+; GFX6-NEXT:    s_not_b32 s1, s2
+; GFX6-NEXT:    v_alignbit_b32 v0, s0, v0, 1
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    s_not_b32 s1, s2
+; GFX8-NEXT:    v_alignbit_b32 v0, s0, v0, 1
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    s_not_b32 s1, s2
+; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, 1
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_alignbit_b32 v0, s0, s1, 1
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX10-NEXT:    s_not_b32 s1, s2
+; GFX10-NEXT:    v_alignbit_b32 v0, s0, v0, s1
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt)
+  ret i32 %result
+}
+
+define amdgpu_ps i32 @s_fshl_i32_5(i32 inreg %lhs, i32 inreg %rhs) {
+; GFX6-LABEL: s_fshl_i32_5:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_mov_b32_e32 v0, s1
+; GFX6-NEXT:    v_alignbit_b32 v0, s0, v0, -5
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_i32_5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    v_alignbit_b32 v0, s0, v0, -5
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_i32_5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, -5
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_i32_5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_alignbit_b32 v0, s0, s1, -5
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 5)
+  ret i32 %result
+}
+
+define amdgpu_ps i32 @s_fshl_i32_8(i32 inreg %lhs, i32 inreg %rhs) {
+; GFX6-LABEL: s_fshl_i32_8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_mov_b32_e32 v0, s1
+; GFX6-NEXT:    v_alignbit_b32 v0, s0, v0, -8
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_i32_8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    v_alignbit_b32 v0, s0, v0, -8
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_i32_8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, -8
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_i32_8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_alignbit_b32 v0, s0, s1, -8
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 8)
+  ret i32 %result
+}
+
+define i32 @v_fshl_i32(i32 %lhs, i32 %rhs, i32 %amt) {
+; GFX6-LABEL: v_fshl_i32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_alignbit_b32 v1, v0, v1, 1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX6-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_alignbit_b32 v1, v0, v1, 1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v1, v0, v1, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_alignbit_b32 v1, v0, v1, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt)
+  ret i32 %result
+}
+
+define i32 @v_fshl_i32_5(i32 %lhs, i32 %rhs) {
+; GFX6-LABEL: v_fshl_i32_5:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_alignbit_b32 v0, v0, v1, -5
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i32_5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, -5
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i32_5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v0, v0, v1, -5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i32_5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, -5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 5)
+  ret i32 %result
+}
+
+define i32 @v_fshl_i32_8(i32 %lhs, i32 %rhs) {
+; GFX6-LABEL: v_fshl_i32_8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_alignbit_b32 v0, v0, v1, -8
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i32_8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, -8
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i32_8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v0, v0, v1, -8
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i32_8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, -8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 8)
+  ret i32 %result
+}
+
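With a constant amount the whole expansion folds to a single v_alignbit_b32: fshl(x, y, 5) == fshr(x, y, 32 - 5), and the printed operand -5 is just 27 modulo 32, since only the low five bits of the shift operand matter (likewise 8 and -8 below). Reusing the helpers from the sketch above:

  /* Always returns 1: -5 & 31 == 27 == 32 - 5. */
  static int check_const_fold(uint32_t x, uint32_t y) {
    return fshl32(x, y, 5) == fshr32(x, y, (uint32_t)-5);
  }
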
+define amdgpu_ps float @v_fshl_i32_ssv(i32 inreg %lhs, i32 inreg %rhs, i32 %amt) {
+; GFX6-LABEL: v_fshl_i32_ssv:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_alignbit_b32 v1, s0, v1, 1
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX6-NEXT:    v_alignbit_b32 v0, s0, v1, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshl_i32_ssv:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_alignbit_b32 v1, s0, v1, 1
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, s0, v1, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshl_i32_ssv:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_alignbit_b32 v1, s0, v1, 1
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT:    v_alignbit_b32 v0, s0, v1, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshl_i32_ssv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_alignbit_b32 v1, s0, s1, 1
+; GFX10-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX10-NEXT:    v_alignbit_b32 v0, s0, v1, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt)
+  %cast.result = bitcast i32 %result to float
+  ret float %cast.result
+}
+
+define amdgpu_ps float @v_fshl_i32_svs(i32 inreg %lhs, i32 %rhs, i32 inreg %amt) {
+; GFX6-LABEL: v_fshl_i32_svs:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_not_b32 s1, s1
+; GFX6-NEXT:    v_alignbit_b32 v0, s0, v0, 1
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshl_i32_svs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_not_b32 s1, s1
+; GFX8-NEXT:    v_alignbit_b32 v0, s0, v0, 1
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshl_i32_svs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_not_b32 s1, s1
+; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, 1
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshl_i32_svs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_alignbit_b32 v0, s0, v0, 1
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX10-NEXT:    s_not_b32 s1, s1
+; GFX10-NEXT:    v_alignbit_b32 v0, s0, v0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt)
+  %cast.result = bitcast i32 %result to float
+  ret float %cast.result
+}
+
+define amdgpu_ps float @v_fshl_i32_vss(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) {
+; GFX6-LABEL: v_fshl_i32_vss:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_mov_b32_e32 v0, s1
+; GFX6-NEXT:    s_not_b32 s1, s2
+; GFX6-NEXT:    v_alignbit_b32 v0, s0, v0, 1
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshl_i32_vss:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    s_not_b32 s1, s2
+; GFX8-NEXT:    v_alignbit_b32 v0, s0, v0, 1
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshl_i32_vss:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    s_not_b32 s1, s2
+; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, 1
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshl_i32_vss:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_alignbit_b32 v0, s0, s1, 1
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX10-NEXT:    s_not_b32 s1, s2
+; GFX10-NEXT:    v_alignbit_b32 v0, s0, v0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt)
+  %cast.result = bitcast i32 %result to float
+  ret float %cast.result
+}
+
+define <2 x i32> @v_fshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) {
+; GFX6-LABEL: v_fshl_v2i32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_alignbit_b32 v2, v0, v2, 1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX6-NEXT:    v_alignbit_b32 v0, v0, v2, v4
+; GFX6-NEXT:    v_alignbit_b32 v2, v1, v3, 1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_xor_b32_e32 v3, -1, v5
+; GFX6-NEXT:    v_alignbit_b32 v1, v1, v2, v3
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_v2i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_alignbit_b32 v2, v0, v2, 1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, v4
+; GFX8-NEXT:    v_alignbit_b32 v2, v1, v3, 1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_xor_b32_e32 v3, -1, v5
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_v2i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v2, v0, v2, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX9-NEXT:    v_alignbit_b32 v0, v0, v2, v4
+; GFX9-NEXT:    v_alignbit_b32 v2, v1, v3, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v5
+; GFX9-NEXT:    v_alignbit_b32 v1, v1, v2, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_v2i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_alignbit_b32 v2, v0, v2, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 1, v0
+; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX10-NEXT:    v_alignbit_b32 v3, v1, v3, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 1, v1
+; GFX10-NEXT:    v_xor_b32_e32 v5, -1, v5
+; GFX10-NEXT:    v_alignbit_b32 v0, v7, v2, v4
+; GFX10-NEXT:    v_alignbit_b32 v1, v6, v3, v5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt)
+  ret <2 x i32> %result
+}
+
+define <3 x i32> @v_fshl_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) {
+; GFX6-LABEL: v_fshl_v3i32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_alignbit_b32 v3, v0, v3, 1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v6
+; GFX6-NEXT:    v_alignbit_b32 v0, v0, v3, v6
+; GFX6-NEXT:    v_alignbit_b32 v3, v1, v4, 1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v7
+; GFX6-NEXT:    v_alignbit_b32 v1, v1, v3, v4
+; GFX6-NEXT:    v_alignbit_b32 v3, v2, v5, 1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v8
+; GFX6-NEXT:    v_alignbit_b32 v2, v2, v3, v4
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_v3i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_alignbit_b32 v3, v0, v3, 1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_xor_b32_e32 v6, -1, v6
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, v6
+; GFX8-NEXT:    v_alignbit_b32 v3, v1, v4, 1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v7
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v3, v4
+; GFX8-NEXT:    v_alignbit_b32 v3, v2, v5, 1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v8
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v3, v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_v3i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v3, v0, v3, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
+; GFX9-NEXT:    v_alignbit_b32 v0, v0, v3, v6
+; GFX9-NEXT:    v_alignbit_b32 v3, v1, v4, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v7
+; GFX9-NEXT:    v_alignbit_b32 v1, v1, v3, v4
+; GFX9-NEXT:    v_alignbit_b32 v3, v2, v5, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v8
+; GFX9-NEXT:    v_alignbit_b32 v2, v2, v3, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_v3i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_alignbit_b32 v3, v0, v3, 1
+; GFX10-NEXT:    v_alignbit_b32 v4, v1, v4, 1
+; GFX10-NEXT:    v_alignbit_b32 v5, v2, v5, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX10-NEXT:    v_alignbit_b32 v0, v0, v3, v6
+; GFX10-NEXT:    v_alignbit_b32 v1, v1, v4, v7
+; GFX10-NEXT:    v_alignbit_b32 v2, v2, v5, v8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <3 x i32> @llvm.fshl.v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt)
+  ret <3 x i32> %result
+}
+
+define <4 x i32> @v_fshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) {
+; GFX6-LABEL: v_fshl_v4i32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_alignbit_b32 v4, v0, v4, 1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX6-NEXT:    v_alignbit_b32 v0, v0, v4, v8
+; GFX6-NEXT:    v_alignbit_b32 v4, v1, v5, 1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v9
+; GFX6-NEXT:    v_alignbit_b32 v1, v1, v4, v5
+; GFX6-NEXT:    v_alignbit_b32 v4, v2, v6, 1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v10
+; GFX6-NEXT:    v_alignbit_b32 v2, v2, v4, v5
+; GFX6-NEXT:    v_alignbit_b32 v4, v3, v7, 1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
+; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v11
+; GFX6-NEXT:    v_alignbit_b32 v3, v3, v4, v5
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_v4i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_alignbit_b32 v4, v0, v4, 1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, v8
+; GFX8-NEXT:    v_alignbit_b32 v4, v1, v5, 1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v9
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, v5
+; GFX8-NEXT:    v_alignbit_b32 v4, v2, v6, 1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v10
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v4, v5
+; GFX8-NEXT:    v_alignbit_b32 v4, v3, v7, 1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
+; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v11
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v4, v5
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_v4i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v4, v0, v4, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX9-NEXT:    v_alignbit_b32 v0, v0, v4, v8
+; GFX9-NEXT:    v_alignbit_b32 v4, v1, v5, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_xor_b32_e32 v5, -1, v9
+; GFX9-NEXT:    v_alignbit_b32 v1, v1, v4, v5
+; GFX9-NEXT:    v_alignbit_b32 v4, v2, v6, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_xor_b32_e32 v5, -1, v10
+; GFX9-NEXT:    v_alignbit_b32 v2, v2, v4, v5
+; GFX9-NEXT:    v_alignbit_b32 v4, v3, v7, 1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_xor_b32_e32 v5, -1, v11
+; GFX9-NEXT:    v_alignbit_b32 v3, v3, v4, v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_v4i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_alignbit_b32 v22, v1, v5, 1
+; GFX10-NEXT:    v_alignbit_b32 v18, v0, v4, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 1, v0
+; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 1, v1
+; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v9
+; GFX10-NEXT:    v_alignbit_b32 v5, v2, v6, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 1, v2
+; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v10
+; GFX10-NEXT:    v_alignbit_b32 v13, v3, v7, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 1, v3
+; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v11
+; GFX10-NEXT:    v_alignbit_b32 v0, v15, v18, v8
+; GFX10-NEXT:    v_alignbit_b32 v1, v19, v22, v9
+; GFX10-NEXT:    v_alignbit_b32 v2, v23, v5, v10
+; GFX10-NEXT:    v_alignbit_b32 v3, v14, v13, v11
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt)
+  ret <4 x i32> %result
+}
+
+define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt) {
+; GFX6-LABEL: s_fshl_i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s3, s2, 15
+; GFX6-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX6-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s3, s2, 15
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX8-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s3, 1, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s3, s2, 15
+; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX9-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_bfe_u32 s3, 1, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b32 s3, s2, 15
+; GFX10-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s4, 1, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s4
+; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt)
+  ret i16 %result
+}
+
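A reading note for the GFX8+ scalar output: s_bfe_u32 with the literal 0x100000 packs {offset = 0, width = 16}, assuming the usual SALU BFE control encoding with the width field at bit 16, so it is simply a zero-extend of the low 16 bits — the same job the s_and_b32 with 0xffff does in the GFX6 column. A rough model under that assumption:

  /* s_bfe_u32 dst, src, 0x100000  ==>  dst = zext(src[15:0]) */
  static uint32_t bfe_zext16(uint32_t src) {
    return src & 0xffff;
  }
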
+define amdgpu_ps i16 @s_fshl_i16_4(i16 inreg %lhs, i16 inreg %rhs) {
+; GFX6-LABEL: s_fshl_i16_4:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 12
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_i16_4:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_bfe_u32 s2, 4, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s2, 12, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_i16_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_bfe_u32 s2, 4, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_bfe_u32 s2, 12, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_i16_4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_bfe_u32 s2, 4, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s3, 12, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 4)
+  ret i16 %result
+}
+
+define amdgpu_ps i16 @s_fshl_i16_5(i16 inreg %lhs, i16 inreg %rhs) {
+; GFX6-LABEL: s_fshl_i16_5:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 11
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_i16_5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_bfe_u32 s2, 5, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s2, 11, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_i16_5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_bfe_u32 s2, 5, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_bfe_u32 s2, 11, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_i16_5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_bfe_u32 s2, 5, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s3, 11, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 5)
+  ret i16 %result
+}
+
+define i16 @v_fshl_i16(i16 %lhs, i16 %rhs, i16 %amt) {
+; GFX6-LABEL: v_fshl_i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v3, 15, v2
+; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_bfe_u32 v3, v3, 0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v3, 15, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v3, 15, v2
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 1, v1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
+; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, 1, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, v2, v0
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, v3, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt)
+  ret i16 %result
+}
+
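The 16-bit expansion cannot use v_alignbit_b32, so it falls back to plain shifts. The masks keep both amounts in range and sidestep the z == 0 case: shifting the right-hand operand down by 1 first means the second shift only ever needs ~z & 15 = 15 - (z & 15) more bits. A C sketch of what the GFX6 column computes (illustrative, not the committed code):

  #include <stdint.h>

  static uint16_t fshl16(uint16_t x, uint16_t y, uint16_t z) {
    /* When z & 15 == 0 the second term is (y >> 1) >> 15 == 0 for a
       16-bit y, so the result is x and no select on zero is needed. */
    return (uint16_t)((x << (z & 15)) | ((y >> 1) >> (~z & 15)));
  }
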
+define i16 @v_fshl_i16_4(i16 %lhs, i16 %rhs) {
+; GFX6-LABEL: v_fshl_i16_4:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 12, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i16_4:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 12, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i16_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
+; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 12, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i16_4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, 4, v0
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, 12, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 4)
+  ret i16 %result
+}
+
+define i16 @v_fshl_i16_5(i16 %lhs, i16 %rhs) {
+; GFX6-LABEL: v_fshl_i16_5:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 11, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i16_5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 5, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 11, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i16_5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 5, v0
+; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 11, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i16_5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, 5, v0
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, 11, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 5)
+  ret i16 %result
+}
+
+define amdgpu_ps half @v_fshl_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt) {
+; GFX6-LABEL: v_fshl_i16_ssv:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_and_b32_e32 v1, 15, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX6-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshl_i16_ssv:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_and_b32_e32 v1, 15, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    v_lshlrev_b16_e64 v1, v1, s0
+; GFX8-NEXT:    s_bfe_u32 s0, s1, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s1, 1, 0x100000
+; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX8-NEXT:    v_lshrrev_b16_e64 v0, v0, s0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshl_i16_ssv:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_and_b32_e32 v1, 15, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT:    v_lshlrev_b16_e64 v1, v1, s0
+; GFX9-NEXT:    s_bfe_u32 s0, s1, 0x100000
+; GFX9-NEXT:    s_bfe_u32 s1, 1, 0x100000
+; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX9-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX9-NEXT:    v_lshrrev_b16_e64 v0, v0, s0
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshl_i16_ssv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s2, 1, 0x100000
+; GFX10-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, v0, s0
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, v1, s1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt)
+  %cast.result = bitcast i16 %result to half
+  ret half %cast.result
+}
+
+define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt) {
+; GFX6-LABEL: v_fshl_i16_svs:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s2, s1, 15
+; GFX6-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s1, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshl_i16_svs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s2, s1, 15
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 1, v0
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX8-NEXT:    v_lshrrev_b16_e32 v0, s1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshl_i16_svs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s2, s1, 15
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX9-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 1, v0
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX9-NEXT:    v_lshrrev_b16_e32 v0, s1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshl_i16_svs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_lshrrev_b16_e64 v0, 1, v0
+; GFX10-NEXT:    s_andn2_b32 s2, 15, s1
+; GFX10-NEXT:    s_and_b32 s1, s1, 15
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    v_lshrrev_b16_e64 v0, s2, v0
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt)
+  %cast.result = bitcast i16 %result to half
+  ret half %cast.result
+}
+
+define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt) {
+; GFX6-LABEL: v_fshl_i16_vss:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s2, s1, 15
+; GFX6-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
+; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshl_i16_vss:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s2, s1, 15
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, s2, v0
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s2, 1, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s2
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshl_i16_vss:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s2, s1, 15
+; GFX9-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, s2, v0
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX9-NEXT:    s_bfe_u32 s2, 1, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s0, s0, s2
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshl_i16_vss:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b32 s2, s1, 15
+; GFX10-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s3, 1, 0x100000
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, s2, v0
+; GFX10-NEXT:    s_lshr_b32 s0, s0, s3
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt)
+  %cast.result = bitcast i16 %result to half
+  ret half %cast.result
+}
+
+define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
+; GFX6-LABEL: s_fshl_v2i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s5, s2, 15
+; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX6-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX6-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s5
+; GFX6-NEXT:    s_and_b32 s5, s1, 0xffff
+; GFX6-NEXT:    s_lshr_b32 s5, s5, 1
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s2, s5, s2
+; GFX6-NEXT:    s_or_b32 s0, s0, s2
+; GFX6-NEXT:    s_and_b32 s2, s4, 15
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT:    s_andn2_b32 s4, 15, s4
+; GFX6-NEXT:    s_lshl_b32 s2, s3, s2
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 17
+; GFX6-NEXT:    s_bfe_u32 s3, s4, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_v2i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s6, s2, 15
+; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s5, s2, 16
+; GFX8-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s6
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s6, 1, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s6
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s5, 15
+; GFX8-NEXT:    s_andn2_b32 s2, 15, s5
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
+; GFX8-NEXT:    s_lshr_b32 s3, s4, s6
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s2, s3, s2
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000f
+; GFX9-NEXT:    s_and_b32 s4, s2, s3
+; GFX9-NEXT:    s_andn2_b32 s2, s3, s2
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX9-NEXT:    s_lshl_b32 s3, s3, s5
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
+; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX9-NEXT:    s_and_b32 s1, s1, s4
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
+; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
+; GFX9-NEXT:    s_and_b32 s1, s1, s4
+; GFX9-NEXT:    s_and_b32 s2, s2, s4
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX9-NEXT:    s_lshr_b32 s2, s3, s5
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_v2i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 s5, 0xffff
+; GFX10-NEXT:    s_mov_b32 s3, 0xf000f
+; GFX10-NEXT:    s_and_b32 s7, s1, s5
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX10-NEXT:    s_lshr_b32 s7, s7, 1
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX10-NEXT:    s_and_b32 s4, s2, s3
+; GFX10-NEXT:    s_andn2_b32 s2, s3, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s7, s1
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX10-NEXT:    s_and_b32 s1, s1, s5
+; GFX10-NEXT:    s_and_b32 s5, s2, s5
+; GFX10-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX10-NEXT:    s_lshl_b32 s3, s3, s6
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s5
+; GFX10-NEXT:    s_lshr_b32 s2, s4, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt)
+  %cast = bitcast <2 x i16> %result to i32
+  ret i32 %cast
+}
+
+define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
+; GFX6-LABEL: v_fshl_v2i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v5, 15, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX6-NEXT:    v_bfe_u32 v5, v5, 0, 16
+; GFX6-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v5, v0
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 1, v5
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v2, v5
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 15, v4
+; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
+; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v2, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 17, v1
+; GFX6-NEXT:    v_bfe_u32 v3, v4, 0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_v2i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT:    v_lshrrev_b16_e32 v5, 1, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v4, v4, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v2, v5
+; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v3
+; GFX8-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v4, 1
+; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v3, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xf000f
+; GFX9-NEXT:    v_and_b32_e32 v3, s4, v2
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 1, v1 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
+; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_v2i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
+; GFX10-NEXT:    s_mov_b32 s4, 0xf000f
+; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 1, v1 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
+; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt)
+  ret <2 x i16> %result
+}
+
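On GFX9 and GFX10 both halves are legalized together: the 0xf000f constant applies the per-lane "& 15" to the packed amount in one 32-bit AND (and andn2/xor against it yields the per-lane inverted amount), after which v_pk_lshlrev_b16/v_pk_lshrrev_b16 shift both lanes at once. Semantically it is the 16-bit sketch above applied per lane; a hedged model:

  /* Packed <2 x i16> carried in a uint32_t; lane semantics only — the
     hardware handles both lanes in single v_pk_* instructions. */
  static uint32_t fshl_v2i16(uint32_t x, uint32_t y, uint32_t z) {
    uint32_t lo = fshl16((uint16_t)x, (uint16_t)y, (uint16_t)z);
    uint32_t hi = fshl16((uint16_t)(x >> 16), (uint16_t)(y >> 16),
                         (uint16_t)(z >> 16));
    return lo | (hi << 16);
  }
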
+define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
+; GFX6-LABEL: v_fshl_v2i16_4_8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; GFX6-NEXT:    s_bfe_u32 s4, 4, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
+; GFX6-NEXT:    s_bfe_u32 s4, 11, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, s4, v3
+; GFX6-NEXT:    s_bfe_u32 s4, 8, 0x100000
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 17, v1
+; GFX6-NEXT:    s_bfe_u32 s4, 7, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_v2i16_4_8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 1, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 11, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT:    v_mov_b32_e32 v3, 1
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 7, v1
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_v2i16_4_8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, 16
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, 16
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX9-NEXT:    s_sub_i32 s4, 0, 16
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, s4, v2
+; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v3
+; GFX9-NEXT:    v_mul_hi_u32 v4, v2, v4
+; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
+; GFX9-NEXT:    v_mul_hi_u32 v2, 4, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
+; GFX9-NEXT:    v_mul_hi_u32 v3, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX9-NEXT:    v_sub_u32_e32 v2, 4, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v4, 16, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 16, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, 16, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 16, v2
+; GFX9-NEXT:    v_sub_u32_e32 v3, 8, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v4, 16, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 16, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v4, 16, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 16, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v4, v3
+; GFX9-NEXT:    v_pk_sub_i16 v3, 16, v2 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
+; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_v2i16_4_8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v2, 16
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, 16
+; GFX10-NEXT:    s_sub_i32 s4, 0, 16
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX10-NEXT:    v_mul_lo_u32 v4, s4, v2
+; GFX10-NEXT:    v_mul_lo_u32 v5, s4, v3
+; GFX10-NEXT:    v_mul_hi_u32 v4, v2, v4
+; GFX10-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v5
+; GFX10-NEXT:    v_mul_hi_u32 v2, 8, v2
+; GFX10-NEXT:    v_mul_hi_u32 v3, 4, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 8, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 4, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v4, 16, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v4, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX10-NEXT:    v_and_or_b32 v2, v3, 0xffff, v2
+; GFX10-NEXT:    v_pk_sub_i16 v3, 16, v2 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
+; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> <i16 4, i16 8>)
+  ret <2 x i16> %result
+}
+
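The constant <4, 8> amounts do not hit a single-instruction path on GFX9/GFX10. What looks like a stray floating-point run (v_rcp_iflag_f32, v_mul_hi_u32, the repeated v_cndmask corrections) appears to be the generic unsigned-remainder expansion reducing each amount modulo 16 — a reduction on constants that a later combine could presumably fold away. The tail of the sequence is the straightforward shift form, valid here because both reduced amounts are nonzero:

  /* v_pk_sub_i16 computes 16 - a, then pk_lshl/pk_lshr/or per lane;
     assumes 1 <= a <= 15, which holds for the amounts 4 and 8. */
  static uint16_t fshl16_const(uint16_t x, uint16_t y, uint16_t a) {
    return (uint16_t)((x << a) | (y >> (16 - a)));
  }
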
+define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> %amt) {
+; GFX6-LABEL: v_fshl_v2i16_ssv:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
+; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, 15, v1
+; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
+; GFX6-NEXT:    s_lshr_b32 s0, s1, 17
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT:    v_lshl_b32_e32 v2, s2, v2
+; GFX6-NEXT:    v_lshr_b32_e32 v1, s0, v1
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshl_v2i16_ssv:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    v_lshlrev_b16_e64 v2, v2, s0
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_bfe_u32 s0, s1, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s1, 1, 0x100000
+; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX8-NEXT:    v_lshrrev_b16_e64 v0, v0, s0
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 15, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX8-NEXT:    s_lshr_b32 s0, s3, s1
+; GFX8-NEXT:    v_lshlrev_b16_e64 v2, v2, s2
+; GFX8-NEXT:    v_lshrrev_b16_e64 v1, v1, s0
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshl_v2i16_ssv:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s2, 0xf000f
+; GFX9-NEXT:    v_and_b32_e32 v1, s2, v0
+; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v1, s0
+; GFX9-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s1, s0
+; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v0, s0
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshl_v2i16_ssv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v0
+; GFX10-NEXT:    s_mov_b32 s2, 0xf000f
+; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, s2, v1
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX10-NEXT:    s_lshr_b32 s2, s3, 1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v0, s0
+; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v1, s1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt)
+  %cast = bitcast <2 x i16> %result to float
+  ret float %cast
+}
+
+define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) {
+; GFX6-LABEL: v_fshl_v2i16_svs:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX6-NEXT:    s_and_b32 s4, s1, 15
+; GFX6-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX6-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s1, v1
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX6-NEXT:    v_or_b32_e32 v1, s0, v1
+; GFX6-NEXT:    s_and_b32 s0, s3, 15
+; GFX6-NEXT:    s_andn2_b32 s1, 15, s3
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 17, v0
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s0, s2, s0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s1, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshl_v2i16_svs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s4, s1, 15
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v0
+; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, s1, v1
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX8-NEXT:    v_or_b32_e32 v1, s0, v1
+; GFX8-NEXT:    s_and_b32 s0, s3, 15
+; GFX8-NEXT:    v_mov_b32_e32 v2, 1
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s3
+; GFX8-NEXT:    s_lshl_b32 s0, s2, s0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v0, s1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshl_v2i16_svs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s2, 0xf000f
+; GFX9-NEXT:    s_and_b32 s3, s1, s2
+; GFX9-NEXT:    s_andn2_b32 s1, s2, s1
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 1, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX9-NEXT:    s_lshl_b32 s2, s2, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    v_pk_lshrrev_b16 v0, s1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshl_v2i16_svs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 s2, 0xf000f
+; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 1, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    s_and_b32 s3, s1, s2
+; GFX10-NEXT:    s_andn2_b32 s1, s2, s1
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX10-NEXT:    v_pk_lshrrev_b16 v0, s1, v0
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX10-NEXT:    s_lshl_b32 s1, s2, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt)
+  %cast = bitcast <2 x i16> %result to float
+  ret float %cast
+}
+
+define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
+; GFX6-LABEL: v_fshl_v2i16_vss:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s3, s1, 15
+; GFX6-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX6-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s3, v0
+; GFX6-NEXT:    s_and_b32 s3, s0, 0xffff
+; GFX6-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s1, s3, s1
+; GFX6-NEXT:    v_or_b32_e32 v0, s1, v0
+; GFX6-NEXT:    s_and_b32 s1, s2, 15
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s1, v1
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 17
+; GFX6-NEXT:    s_bfe_u32 s1, s2, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX6-NEXT:    v_or_b32_e32 v1, s0, v1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshl_v2i16_vss:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s4, s1, 15
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, s4, v0
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s4, 1, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s4
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s3
+; GFX8-NEXT:    v_or_b32_e32 v1, s0, v1
+; GFX8-NEXT:    s_and_b32 s0, s3, 15
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    s_lshr_b32 s0, s2, s4
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshl_v2i16_vss:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s2, 0xf000f
+; GFX9-NEXT:    s_and_b32 s3, s1, s2
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s3, v0
+; GFX9-NEXT:    s_mov_b32 s3, 0xffff
+; GFX9-NEXT:    s_andn2_b32 s1, s2, s1
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_and_b32 s0, s0, s3
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX9-NEXT:    s_and_b32 s0, s0, s3
+; GFX9-NEXT:    s_and_b32 s1, s1, s3
+; GFX9-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX9-NEXT:    s_lshr_b32 s1, s2, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshl_v2i16_vss:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 s3, 0xffff
+; GFX10-NEXT:    s_mov_b32 s2, 0xf000f
+; GFX10-NEXT:    s_and_b32 s5, s0, s3
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s5, s5, 1
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX10-NEXT:    s_and_b32 s4, s1, s2
+; GFX10-NEXT:    s_andn2_b32 s1, s2, s1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s5, s0
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s4, v0
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-NEXT:    s_and_b32 s0, s0, s3
+; GFX10-NEXT:    s_and_b32 s3, s1, s3
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX10-NEXT:    s_lshr_b32 s0, s0, s3
+; GFX10-NEXT:    s_lshr_b32 s1, s2, s1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt)
+  %cast = bitcast <2 x i16> %result to float
+  ret float %cast
+}
+
+; ; FIXME
+; define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) {
+;   %result = call <3 x i16> @llvm.fshl.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
+;   %cast = bitcast <3 x i16> %result to i48
+;   ret i48 %cast
+; }
+
+; ; FIXME
+; define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) {
+;   %result = call <3 x i16> @llvm.fshl.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
+;   %cast.result = bitcast <3 x i16> %result to <3 x half>
+;   ret <3 x half> %cast.result
+; }
+
+define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs, <4 x i16> inreg %amt) {
+; GFX6-LABEL: s_fshl_v4i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s12, s8, 15
+; GFX6-NEXT:    s_bfe_u32 s12, s12, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s12
+; GFX6-NEXT:    s_mov_b32 s12, 0xffff
+; GFX6-NEXT:    s_andn2_b32 s8, 15, s8
+; GFX6-NEXT:    s_and_b32 s4, s4, s12
+; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s4, s4, s8
+; GFX6-NEXT:    s_or_b32 s0, s0, s4
+; GFX6-NEXT:    s_and_b32 s4, s9, 15
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s1, s1, s4
+; GFX6-NEXT:    s_and_b32 s4, s5, s12
+; GFX6-NEXT:    s_andn2_b32 s8, 15, s9
+; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX6-NEXT:    s_bfe_u32 s5, s8, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
+; GFX6-NEXT:    s_or_b32 s1, s1, s4
+; GFX6-NEXT:    s_and_b32 s4, s10, 15
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT:    s_andn2_b32 s5, 15, s10
+; GFX6-NEXT:    s_lshl_b32 s2, s2, s4
+; GFX6-NEXT:    s_and_b32 s4, s6, s12
+; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
+; GFX6-NEXT:    s_or_b32 s2, s2, s4
+; GFX6-NEXT:    s_and_b32 s4, s11, 15
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT:    s_andn2_b32 s5, 15, s11
+; GFX6-NEXT:    s_lshl_b32 s3, s3, s4
+; GFX6-NEXT:    s_and_b32 s4, s7, s12
+; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
+; GFX6-NEXT:    s_or_b32 s3, s3, s4
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_bfe_u32 s1, s2, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s2, s3, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_or_b32 s1, s1, s2
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_v4i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s12, s4, 15
+; GFX8-NEXT:    s_bfe_u32 s12, s12, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
+; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
+; GFX8-NEXT:    s_andn2_b32 s4, 15, s4
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s12
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s12, 1, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s2, s2, s12
+; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s2, s2, s4
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, s10, 15
+; GFX8-NEXT:    s_andn2_b32 s4, 15, s10
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s2, s6, s2
+; GFX8-NEXT:    s_lshr_b32 s6, s8, s12
+; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s4, s6, s4
+; GFX8-NEXT:    s_or_b32 s2, s2, s4
+; GFX8-NEXT:    s_and_b32 s4, s5, 15
+; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
+; GFX8-NEXT:    s_andn2_b32 s5, 15, s5
+; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
+; GFX8-NEXT:    s_bfe_u32 s4, s5, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s3, s3, s12
+; GFX8-NEXT:    s_lshr_b32 s3, s3, s4
+; GFX8-NEXT:    s_or_b32 s1, s1, s3
+; GFX8-NEXT:    s_and_b32 s3, s11, 15
+; GFX8-NEXT:    s_andn2_b32 s4, 15, s11
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s5, s9, s12
+; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s3, s7, s3
+; GFX8-NEXT:    s_lshr_b32 s4, s5, s4
+; GFX8-NEXT:    s_or_b32 s3, s3, s4
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_bfe_u32 s2, s3, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_v4i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s6, 0xf000f
+; GFX9-NEXT:    s_and_b32 s7, s4, s6
+; GFX9-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s9, s7, 16
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s7
+; GFX9-NEXT:    s_lshl_b32 s7, s8, s9
+; GFX9-NEXT:    s_mov_b32 s8, 0xffff
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s7
+; GFX9-NEXT:    s_lshr_b32 s7, s2, 16
+; GFX9-NEXT:    s_and_b32 s2, s2, s8
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX9-NEXT:    s_lshr_b32 s7, s7, 1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s7
+; GFX9-NEXT:    s_andn2_b32 s4, s6, s4
+; GFX9-NEXT:    s_lshr_b32 s7, s2, 16
+; GFX9-NEXT:    s_lshr_b32 s9, s4, 16
+; GFX9-NEXT:    s_and_b32 s2, s2, s8
+; GFX9-NEXT:    s_and_b32 s4, s4, s8
+; GFX9-NEXT:    s_lshr_b32 s2, s2, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s7, s9
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s5, s6
+; GFX9-NEXT:    s_andn2_b32 s4, s6, s5
+; GFX9-NEXT:    s_lshr_b32 s5, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
+; GFX9-NEXT:    s_lshl_b32 s2, s5, s6
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX9-NEXT:    s_lshr_b32 s2, s3, 16
+; GFX9-NEXT:    s_and_b32 s3, s3, s8
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
+; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
+; GFX9-NEXT:    s_and_b32 s2, s2, s8
+; GFX9-NEXT:    s_and_b32 s4, s4, s8
+; GFX9-NEXT:    s_lshr_b32 s2, s2, s4
+; GFX9-NEXT:    s_lshr_b32 s3, s3, s5
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_v4i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 s9, 0xffff
+; GFX10-NEXT:    s_mov_b32 s6, 0xf000f
+; GFX10-NEXT:    s_and_b32 s11, s2, s9
+; GFX10-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX10-NEXT:    s_and_b32 s7, s4, s6
+; GFX10-NEXT:    s_lshr_b32 s11, s11, 1
+; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX10-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX10-NEXT:    s_andn2_b32 s4, s6, s4
+; GFX10-NEXT:    s_lshr_b32 s10, s7, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s11, s2
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s7
+; GFX10-NEXT:    s_lshl_b32 s7, s8, s10
+; GFX10-NEXT:    s_lshr_b32 s8, s2, 16
+; GFX10-NEXT:    s_and_b32 s10, s4, s9
+; GFX10-NEXT:    s_and_b32 s2, s2, s9
+; GFX10-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX10-NEXT:    s_lshr_b32 s2, s2, s10
+; GFX10-NEXT:    s_lshr_b32 s4, s8, s4
+; GFX10-NEXT:    s_and_b32 s8, s3, s9
+; GFX10-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX10-NEXT:    s_and_b32 s4, s5, s6
+; GFX10-NEXT:    s_lshr_b32 s8, s8, 1
+; GFX10-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s7
+; GFX10-NEXT:    s_andn2_b32 s5, s6, s5
+; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
+; GFX10-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s8, s3
+; GFX10-NEXT:    s_lshl_b32 s1, s1, s4
+; GFX10-NEXT:    s_lshl_b32 s4, s6, s7
+; GFX10-NEXT:    s_lshr_b32 s6, s3, 16
+; GFX10-NEXT:    s_and_b32 s7, s5, s9
+; GFX10-NEXT:    s_and_b32 s3, s3, s9
+; GFX10-NEXT:    s_lshr_b32 s5, s5, 16
+; GFX10-NEXT:    s_lshr_b32 s3, s3, s7
+; GFX10-NEXT:    s_lshr_b32 s5, s6, s5
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
+; GFX10-NEXT:    s_or_b32 s0, s0, s2
+; GFX10-NEXT:    s_or_b32 s1, s1, s3
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
+  %cast.result = bitcast <4 x i16> %result to <2 x i32>
+  ret <2 x i32> %cast.result
+}
+
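+; The v4i16 cases repeat the v2i16 sequence once per 32-bit half.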
+define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) {
+; GFX6-LABEL: v_fshl_v4i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v12, 15, v8
+; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX6-NEXT:    s_mov_b32 s4, 0xffff
+; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
+; GFX6-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX6-NEXT:    v_bfe_u32 v12, v12, 0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_bfe_u32 v8, v8, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v12, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v8, v4
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX6-NEXT:    v_and_b32_e32 v4, 15, v9
+; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v9
+; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
+; GFX6-NEXT:    v_and_b32_e32 v4, s4, v5
+; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_bfe_u32 v5, v8, 0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX6-NEXT:    v_and_b32_e32 v4, 15, v10
+; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v10
+; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
+; GFX6-NEXT:    v_mov_b32_e32 v12, 0xffff
+; GFX6-NEXT:    v_and_b32_e32 v5, 15, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, v6, v12
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_bfe_u32 v5, v5, 0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX6-NEXT:    v_and_b32_e32 v4, 15, v11
+; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v11
+; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
+; GFX6-NEXT:    v_and_b32_e32 v5, 15, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v4, v3
+; GFX6-NEXT:    v_and_b32_e32 v4, v7, v12
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_bfe_u32 v5, v5, 0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
+; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_v4i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v8, 15, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
+; GFX8-NEXT:    v_lshrrev_b16_e32 v9, 1, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v8, v8, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v4, v9
+; GFX8-NEXT:    v_or_b32_e32 v4, v8, v4
+; GFX8-NEXT:    v_and_b32_e32 v8, 15, v6
+; GFX8-NEXT:    v_xor_b32_e32 v6, -1, v6
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v8, 1
+; GFX8-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v6, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 15, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, 15, v5
+; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 1, v3
+; GFX8-NEXT:    v_lshrrev_b16_e32 v5, v5, v6
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, v2, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, 15, v7
+; GFX8-NEXT:    v_xor_b32_e32 v6, -1, v7
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v5, 1
+; GFX8-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, v6, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT:    v_mov_b32_e32 v3, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_v4i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xf000f
+; GFX9-NEXT:    v_and_b32_e32 v6, s4, v4
+; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX9-NEXT:    v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v4, v2
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v6, v0
+; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v5
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, s4, v5
+; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v2, v1
+; GFX9-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX9-NEXT:    v_pk_lshrrev_b16 v2, 1, v3 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v4, v2
+; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_v4i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v4
+; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v5
+; GFX10-NEXT:    s_mov_b32 s4, 0xf000f
+; GFX10-NEXT:    v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_and_b32_e32 v11, s4, v4
+; GFX10-NEXT:    v_and_b32_e32 v15, s4, v6
+; GFX10-NEXT:    v_and_b32_e32 v19, s4, v5
+; GFX10-NEXT:    v_and_b32_e32 v6, s4, v7
+; GFX10-NEXT:    v_pk_lshrrev_b16 v3, 1, v3 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v11, v0
+; GFX10-NEXT:    v_pk_lshrrev_b16 v2, v15, v2
+; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v19, v1
+; GFX10-NEXT:    v_pk_lshrrev_b16 v3, v6, v3
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
+  %cast.result = bitcast <4 x i16> %result to <4 x half>
+  ret <4 x half> %cast.result
+}
+
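+; The i64 tests check the generic shift-based expansion for a variable
+; amount: fshl(x, y, z) -> (x << (z & 63)) | ((y >> 1) >> (~z & 63)), where
+; pre-shifting the rhs by 1 keeps both shift amounts below the bit width.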
+define amdgpu_ps i64 @s_fshl_i64(i64 inreg %lhs, i64 inreg %rhs, i64 inreg %amt) {
+; GCN-LABEL: s_fshl_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_and_b64 s[6:7], s[4:5], 63
+; GCN-NEXT:    s_andn2_b64 s[4:5], 63, s[4:5]
+; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
+; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], s6
+; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
+; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    ; return to shader part epilog
+  %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt)
+  ret i64 %result
+}
+
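+; With a constant amount the expansion folds to immediate shifts; an amount
+; of 32 in particular reduces to swapping the two register halves.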
+define amdgpu_ps i64 @s_fshl_i64_5(i64 inreg %lhs, i64 inreg %rhs) {
+; GCN-LABEL: s_fshl_i64_5:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_lshr_b32 s2, s3, 27
+; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], 5
+; GCN-NEXT:    s_mov_b32 s3, 0
+; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    ; return to shader part epilog
+  %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 5)
+  ret i64 %result
+}
+
+define amdgpu_ps i64 @s_fshl_i64_32(i64 inreg %lhs, i64 inreg %rhs) {
+; GCN-LABEL: s_fshl_i64_32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s0, 0
+; GCN-NEXT:    s_mov_b32 s2, s3
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    ; return to shader part epilog
+  %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 32)
+  ret i64 %result
+}
+
+define amdgpu_ps i64 @s_fshl_i64_48(i64 inreg %lhs, i64 inreg %rhs) {
+; GCN-LABEL: s_fshl_i64_48:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_lshl_b32 s1, s0, 16
+; GCN-NEXT:    s_mov_b32 s0, 0
+; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], 16
+; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    ; return to shader part epilog
+  %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 48)
+  ret i64 %result
+}
+
+define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
+; GFX6-LABEL: v_fshl_i64:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v5, 63, v4
+; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
+; GFX6-NEXT:    v_and_b32_e32 v4, 63, v4
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v5
+; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], v4
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v5, 63, v4
+; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v4, 63, v4
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v5, 63, v4
+; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
+; GFX9-NEXT:    v_and_b32_e32 v4, 63, v4
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_xor_b32_e32 v5, -1, v4
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
+; GFX10-NEXT:    v_and_b32_e32 v7, 63, v4
+; GFX10-NEXT:    v_and_b32_e32 v5, 63, v5
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v5, v[2:3]
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt)
+  ret i64 %result
+}
+
+define i64 @v_fshl_i64_5(i64 %lhs, i64 %rhs) {
+; GFX6-LABEL: v_fshl_i64_5:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 5
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 27, v3
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i64_5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 5, v[0:1]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 27, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i64_5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 5, v[0:1]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 27, v3
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i64_5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 5, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 27, v3
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 5)
+  ret i64 %result
+}
+
+define i64 @v_fshl_i64_32(i64 %lhs, i64 %rhs) {
+; GFX6-LABEL: v_fshl_i64_32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, v3
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i64_32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i64_32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i64_32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 32)
+  ret i64 %result
+}
+
+define i64 @v_fshl_i64_48(i64 %lhs, i64 %rhs) {
+; GFX6-LABEL: v_fshl_i64_48:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[2:3], 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i64_48:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v4, v0
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[2:3]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i64_48:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[2:3]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX9-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i64_48:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 16, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX10-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 48)
+  ret i64 %result
+}
+
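+; The _ssv, _svs and _vss variants vary which operand is divergent (the
+; amount, the rhs and the lhs respectively) to cover the SGPR/VGPR mixes.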
+define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 %amt) {
+; GFX6-LABEL: v_fshl_i64_ssv:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_and_b32_e32 v1, 63, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, 63, v0
+; GFX6-NEXT:    v_lshl_b64 v[0:1], s[0:1], v1
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[2:3], 1
+; GFX6-NEXT:    v_lshr_b64 v[2:3], s[0:1], v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshl_i64_ssv:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_and_b32_e32 v1, 63, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 63, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v1, s[0:1]
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[2:3], 1
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v2, s[0:1]
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshl_i64_ssv:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_and_b32_e32 v1, 63, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 63, v0
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v1, s[0:1]
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[2:3], 1
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v2, s[0:1]
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshl_i64_ssv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 63, v0
+; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
+; GFX10-NEXT:    v_and_b32_e32 v2, 63, v1
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, s[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, s[2:3]
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt)
+  %cast = bitcast i64 %result to <2 x float>
+  ret <2 x float> %cast
+}
+
+define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg %amt) {
+; GFX6-LABEL: v_fshl_i64_svs:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], 1
+; GFX6-NEXT:    s_and_b64 s[4:5], s[2:3], 63
+; GFX6-NEXT:    s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], s2
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s4
+; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, s1, v1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshl_i64_svs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
+; GFX8-NEXT:    s_and_b64 s[4:5], s[2:3], 63
+; GFX8-NEXT:    s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], s2, v[0:1]
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s4
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, s1, v1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshl_i64_svs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
+; GFX9-NEXT:    s_and_b64 s[4:5], s[2:3], 63
+; GFX9-NEXT:    s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], s2, v[0:1]
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s4
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_or_b32_e32 v1, s1, v1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshl_i64_svs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
+; GFX10-NEXT:    s_andn2_b64 s[4:5], 63, s[2:3]
+; GFX10-NEXT:    s_and_b64 s[2:3], s[2:3], 63
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, s1, v1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt)
+  %cast = bitcast i64 %result to <2 x float>
+  ret <2 x float> %cast
+}
+
+define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg %amt) {
+; GFX6-LABEL: v_fshl_i64_vss:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b64 s[4:5], s[2:3], 63
+; GFX6-NEXT:    s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], s4
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, s1, v1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshl_i64_vss:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b64 s[4:5], s[2:3], 63
+; GFX8-NEXT:    s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, s1, v1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshl_i64_vss:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b64 s[4:5], s[2:3], 63
+; GFX9-NEXT:    s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_or_b32_e32 v1, s1, v1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshl_i64_vss:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b64 s[4:5], s[2:3], 63
+; GFX10-NEXT:    s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX10-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
+; GFX10-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, s1, v1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt)
+  %cast = bitcast i64 %result to <2 x float>
+  ret <2 x float> %cast
+}
+
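+; <2 x i64> scalarizes to the same i64 expansion applied to each element.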
+define amdgpu_ps <2 x i64> @s_fshl_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs, <2 x i64> inreg %amt) {
+; GFX6-LABEL: s_fshl_v2i64:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b64 s[12:13], s[8:9], 63
+; GFX6-NEXT:    s_andn2_b64 s[8:9], 63, s[8:9]
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], 1
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s12
+; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX6-NEXT:    s_and_b64 s[4:5], s[10:11], 63
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
+; GFX6-NEXT:    s_andn2_b64 s[8:9], 63, s[10:11]
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
+; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_v2i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b64 s[12:13], s[8:9], 63
+; GFX8-NEXT:    s_andn2_b64 s[8:9], 63, s[8:9]
+; GFX8-NEXT:    s_lshr_b64 s[4:5], s[4:5], 1
+; GFX8-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s12
+; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT:    s_and_b64 s[4:5], s[10:11], 63
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
+; GFX8-NEXT:    s_andn2_b64 s[8:9], 63, s[10:11]
+; GFX8-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
+; GFX8-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
+; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_v2i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b64 s[12:13], s[8:9], 63
+; GFX9-NEXT:    s_andn2_b64 s[8:9], 63, s[8:9]
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], 1
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s12
+; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT:    s_and_b64 s[4:5], s[10:11], 63
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
+; GFX9-NEXT:    s_andn2_b64 s[8:9], 63, s[10:11]
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_v2i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b64 s[12:13], s[8:9], 63
+; GFX10-NEXT:    s_andn2_b64 s[8:9], 63, s[8:9]
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], 1
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], 1
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
+; GFX10-NEXT:    s_and_b64 s[8:9], s[10:11], 63
+; GFX10-NEXT:    s_andn2_b64 s[10:11], 63, s[10:11]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s12
+; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], s10
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt)
+  ret <2 x i64> %result
+}
+
+define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
+; GFX6-LABEL: v_fshl_v2i64:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v9, 63, v8
+; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[4:5], 1
+; GFX6-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[4:5], v8
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v9
+; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v10
+; GFX6-NEXT:    v_lshr_b64 v[6:7], v[6:7], 1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX6-NEXT:    v_and_b32_e32 v4, 63, v10
+; GFX6-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], v4
+; GFX6-NEXT:    v_lshr_b64 v[6:7], v[6:7], v8
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX6-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_v2i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v9, 63, v8
+; GFX8-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
+; GFX8-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX8-NEXT:    v_xor_b32_e32 v8, -1, v10
+; GFX8-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_e32 v4, 63, v10
+; GFX8-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v8, v[6:7]
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_v2i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v9, 63, v8
+; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v10
+; GFX9-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 63, v10
+; GFX9-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b64 v[6:7], v8, v[6:7]
+; GFX9-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX9-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX9-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_v2i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v8
+; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v10
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
+; GFX10-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
+; GFX10-NEXT:    v_and_b32_e32 v15, 63, v8
+; GFX10-NEXT:    v_and_b32_e32 v19, 63, v9
+; GFX10-NEXT:    v_and_b32_e32 v13, 63, v11
+; GFX10-NEXT:    v_and_b32_e32 v9, 63, v10
+; GFX10-NEXT:    v_lshlrev_b64 v[11:12], v15, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v19, v[4:5]
+; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v13, v[6:7]
+; GFX10-NEXT:    v_lshlrev_b64 v[15:16], v9, v[2:3]
+; GFX10-NEXT:    v_or_b32_e32 v0, v11, v4
+; GFX10-NEXT:    v_or_b32_e32 v1, v12, v5
+; GFX10-NEXT:    v_or_b32_e32 v2, v15, v6
+; GFX10-NEXT:    v_or_b32_e32 v3, v16, v7
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt)
+  ret <2 x i64> %result
+}
+
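+; For i128 the amount is masked with 0x7f and each 64-bit sub-shift is
+; itself expanded with compares against 64 and cselects, since the masked
+; amount can still exceed the legal 64-bit shift range.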
+define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) {
+; GFX6-LABEL: s_fshl_i128:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_movk_i32 s10, 0x7f
+; GFX6-NEXT:    s_mov_b32 s11, 0
+; GFX6-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
+; GFX6-NEXT:    s_andn2_b64 s[8:9], s[10:11], s[8:9]
+; GFX6-NEXT:    s_sub_i32 s9, s12, 64
+; GFX6-NEXT:    s_sub_i32 s13, 64, s12
+; GFX6-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[0:1], s12
+; GFX6-NEXT:    s_lshr_b64 s[14:15], s[0:1], s13
+; GFX6-NEXT:    s_lshl_b64 s[12:13], s[2:3], s12
+; GFX6-NEXT:    s_or_b64 s[12:13], s[14:15], s[12:13]
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[12:13], s[0:1]
+; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX6-NEXT:    s_sub_i32 s9, 1, 64
+; GFX6-NEXT:    s_sub_i32 s14, 64, 1
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[12:13], s[4:5], 1
+; GFX6-NEXT:    s_lshl_b64 s[14:15], s[6:7], s14
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[6:7], 1
+; GFX6-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[6:7], s9
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    s_cselect_b64 s[6:7], s[12:13], s[6:7]
+; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], 0
+; GFX6-NEXT:    s_sub_i32 s14, s8, 64
+; GFX6-NEXT:    s_sub_i32 s12, 64, s8
+; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX6-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
+; GFX6-NEXT:    s_lshl_b64 s[12:13], s[0:1], s12
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[4:5], s8
+; GFX6-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], s14
+; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[6:7], 0
+; GFX6-NEXT:    s_or_b64 s[0:1], s[10:11], s[0:1]
+; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_i128:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_movk_i32 s10, 0x7f
+; GFX8-NEXT:    s_mov_b32 s11, 0
+; GFX8-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
+; GFX8-NEXT:    s_andn2_b64 s[8:9], s[10:11], s[8:9]
+; GFX8-NEXT:    s_sub_i32 s9, s12, 64
+; GFX8-NEXT:    s_sub_i32 s13, 64, s12
+; GFX8-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[0:1], s12
+; GFX8-NEXT:    s_lshr_b64 s[14:15], s[0:1], s13
+; GFX8-NEXT:    s_lshl_b64 s[12:13], s[2:3], s12
+; GFX8-NEXT:    s_or_b64 s[12:13], s[14:15], s[12:13]
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX8-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[12:13], s[0:1]
+; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX8-NEXT:    s_sub_i32 s9, 1, 64
+; GFX8-NEXT:    s_sub_i32 s14, 64, 1
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[12:13], s[4:5], 1
+; GFX8-NEXT:    s_lshl_b64 s[14:15], s[6:7], s14
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[6:7], 1
+; GFX8-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX8-NEXT:    s_lshr_b64 s[6:7], s[6:7], s9
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX8-NEXT:    s_cselect_b64 s[6:7], s[12:13], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], 0
+; GFX8-NEXT:    s_sub_i32 s14, s8, 64
+; GFX8-NEXT:    s_sub_i32 s12, 64, s8
+; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX8-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
+; GFX8-NEXT:    s_lshl_b64 s[12:13], s[0:1], s12
+; GFX8-NEXT:    s_lshr_b64 s[8:9], s[4:5], s8
+; GFX8-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], s14
+; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[6:7], 0
+; GFX8-NEXT:    s_or_b64 s[0:1], s[10:11], s[0:1]
+; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_i128:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_movk_i32 s10, 0x7f
+; GFX9-NEXT:    s_mov_b32 s11, 0
+; GFX9-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
+; GFX9-NEXT:    s_andn2_b64 s[8:9], s[10:11], s[8:9]
+; GFX9-NEXT:    s_sub_i32 s9, s12, 64
+; GFX9-NEXT:    s_sub_i32 s13, 64, s12
+; GFX9-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[0:1], s12
+; GFX9-NEXT:    s_lshr_b64 s[14:15], s[0:1], s13
+; GFX9-NEXT:    s_lshl_b64 s[12:13], s[2:3], s12
+; GFX9-NEXT:    s_or_b64 s[12:13], s[14:15], s[12:13]
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX9-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[12:13], s[0:1]
+; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    s_sub_i32 s9, 1, 64
+; GFX9-NEXT:    s_sub_i32 s14, 64, 1
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[12:13], s[4:5], 1
+; GFX9-NEXT:    s_lshl_b64 s[14:15], s[6:7], s14
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[6:7], 1
+; GFX9-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX9-NEXT:    s_lshr_b64 s[6:7], s[6:7], s9
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX9-NEXT:    s_cselect_b64 s[6:7], s[12:13], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], 0
+; GFX9-NEXT:    s_sub_i32 s14, s8, 64
+; GFX9-NEXT:    s_sub_i32 s12, 64, s8
+; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX9-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
+; GFX9-NEXT:    s_lshl_b64 s[12:13], s[0:1], s12
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[4:5], s8
+; GFX9-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], s14
+; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[6:7], 0
+; GFX9-NEXT:    s_or_b64 s[0:1], s[10:11], s[0:1]
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_i128:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_movk_i32 s10, 0x7f
+; GFX10-NEXT:    s_mov_b32 s11, 0
+; GFX10-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
+; GFX10-NEXT:    s_andn2_b64 s[8:9], s[10:11], s[8:9]
+; GFX10-NEXT:    s_sub_i32 s9, s12, 64
+; GFX10-NEXT:    s_sub_i32 s10, 64, s12
+; GFX10-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_lshl_b64 s[14:15], s[2:3], s12
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[0:1], s10
+; GFX10-NEXT:    s_lshl_b64 s[12:13], s[0:1], s12
+; GFX10-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b64 s[12:13], s[12:13], 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX10-NEXT:    s_sub_i32 s9, 1, 64
+; GFX10-NEXT:    s_sub_i32 s10, 64, 1
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[6:7], s10
+; GFX10-NEXT:    s_lshr_b64 s[14:15], s[6:7], 1
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], s9
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[14:15], 0
+; GFX10-NEXT:    s_sub_i32 s14, s8, 64
+; GFX10-NEXT:    s_sub_i32 s9, 64, s8
+; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[4:5], s9
+; GFX10-NEXT:    s_lshr_b64 s[8:9], s[4:5], s8
+; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], s14
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[8:9], 0
+; GFX10-NEXT:    s_or_b64 s[0:1], s[12:13], s[0:1]
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
+  ret i128 %result
+}
+
+define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
+; GFX6-LABEL: v_fshl_i128:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_movk_i32 s4, 0x7f
+; GFX6-NEXT:    v_and_b32_e32 v14, s4, v8
+; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX6-NEXT:    v_and_b32_e32 v15, s4, v8
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v14
+; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, 64, v14
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[0:1], v8
+; GFX6-NEXT:    v_lshl_b64 v[10:11], v[2:3], v14
+; GFX6-NEXT:    v_lshl_b64 v[12:13], v[0:1], v14
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v16
+; GFX6-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; GFX6-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX6-NEXT:    v_cndmask_b32_e32 v10, 0, v12, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v11, 0, v13, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
+; GFX6-NEXT:    s_sub_i32 s4, 1, 64
+; GFX6-NEXT:    s_sub_i32 s5, 64, 1
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    v_cndmask_b32_e32 v12, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v13, v1, v3, vcc
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[4:5], 1
+; GFX6-NEXT:    v_lshl_b64 v[2:3], v[6:7], s5
+; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[6:7], s4
+; GFX6-NEXT:    s_and_b32 s4, 1, s6
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    s_and_b32 s4, 1, s7
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[6:7], 1
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    s_and_b32 s4, 1, s6
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 64, v15
+; GFX6-NEXT:    v_subrev_i32_e32 v14, vcc, 64, v15
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], v15
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], v6
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], v15
+; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], v14
+; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX6-NEXT:    v_or_b32_e32 v0, v10, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, v11, v1
+; GFX6-NEXT:    v_or_b32_e32 v2, v12, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v13, v3
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i128:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_movk_i32 s4, 0x7f
+; GFX8-NEXT:    v_and_b32_e32 v14, s4, v8
+; GFX8-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX8-NEXT:    v_and_b32_e32 v15, s4, v8
+; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v14
+; GFX8-NEXT:    v_subrev_u32_e32 v16, vcc, 64, v14
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v8, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v14, v[2:3]
+; GFX8-NEXT:    v_lshlrev_b64 v[12:13], v14, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v16, v[0:1]
+; GFX8-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; GFX8-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, 0, v12, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, 0, v13, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
+; GFX8-NEXT:    s_sub_i32 s4, 1, 64
+; GFX8-NEXT:    s_sub_i32 s5, 64, 1
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v1, v3, vcc
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 1, v[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], s5, v[6:7]
+; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], s4, v[6:7]
+; GFX8-NEXT:    s_and_b32 s4, 1, s6
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    s_and_b32 s4, 1, s7
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], 1, v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    s_and_b32 s4, 1, s6
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 64, v15
+; GFX8-NEXT:    v_subrev_u32_e32 v14, vcc, 64, v15
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v15, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], v6, v[2:3]
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v15, v[2:3]
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v14, v[2:3]
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX8-NEXT:    v_or_b32_e32 v0, v10, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v11, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v12, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v13, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i128:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0x7f
+; GFX9-NEXT:    v_and_b32_e32 v14, s4, v8
+; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX9-NEXT:    v_and_b32_e32 v15, s4, v8
+; GFX9-NEXT:    v_sub_u32_e32 v8, 64, v14
+; GFX9-NEXT:    v_subrev_u32_e32 v16, 64, v14
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v8, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v14, v[2:3]
+; GFX9-NEXT:    v_lshlrev_b64 v[12:13], v14, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v16, v[0:1]
+; GFX9-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; GFX9-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, 0, v12, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, 0, v13, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
+; GFX9-NEXT:    s_sub_i32 s4, 1, 64
+; GFX9-NEXT:    s_sub_i32 s5, 64, 1
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v1, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 1, v[4:5]
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], s5, v[6:7]
+; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], s4, v[6:7]
+; GFX9-NEXT:    s_and_b32 s4, 1, s6
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    s_and_b32 s4, 1, s7
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], 1, v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    s_and_b32 s4, 1, s6
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v6, 64, v15
+; GFX9-NEXT:    v_subrev_u32_e32 v14, 64, v15
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v15, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], v6, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v15, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v14, v[2:3]
+; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX9-NEXT:    v_or_b32_e32 v0, v10, v0
+; GFX9-NEXT:    v_or_b32_e32 v1, v11, v1
+; GFX9-NEXT:    v_or_b32_e32 v2, v12, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, v13, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i128:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v8
+; GFX10-NEXT:    s_movk_i32 s4, 0x7f
+; GFX10-NEXT:    v_mov_b32_e32 v27, v2
+; GFX10-NEXT:    v_and_b32_e32 v18, s4, v8
+; GFX10-NEXT:    v_mov_b32_e32 v28, v3
+; GFX10-NEXT:    v_and_b32_e32 v19, s4, v9
+; GFX10-NEXT:    s_sub_i32 s4, 64, 1
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], 1, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[10:11], s4, v[6:7]
+; GFX10-NEXT:    s_sub_i32 s4, 1, 64
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    v_lshrrev_b64 v[15:16], s4, v[6:7]
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX10-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    s_and_b32 s5, 1, vcc_lo
+; GFX10-NEXT:    s_and_b32 s4, 1, s4
+; GFX10-NEXT:    v_sub_nc_u32_e32 v14, 64, v18
+; GFX10-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v15, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s5
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v14, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v18, v[27:28]
+; GFX10-NEXT:    v_sub_nc_u32_e32 v16, 64, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, 0, v7, s4
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v23, 64, v18
+; GFX10-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v19
+; GFX10-NEXT:    v_lshrrev_b64 v[14:15], v19, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v16, v[21:22]
+; GFX10-NEXT:    v_lshlrev_b64 v[12:13], v18, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v23, v[0:1]
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v18
+; GFX10-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v8, v[21:22]
+; GFX10-NEXT:    v_or_b32_e32 v14, v14, v16
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v19
+; GFX10-NEXT:    v_or_b32_e32 v15, v15, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v0, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v1, v11, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v19, v[21:22]
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, v8, v14, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v19
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v18
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v9, v15, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v31, 0, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v23, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v10, v27, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, v1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v28, s6
+; GFX10-NEXT:    v_or_b32_e32 v0, v31, v4
+; GFX10-NEXT:    v_or_b32_e32 v1, v7, v5
+; GFX10-NEXT:    v_or_b32_e32 v2, v15, v6
+; GFX10-NEXT:    v_or_b32_e32 v3, v3, v8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
+  ret i128 %result
+}
+
+define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) {
+; GFX6-LABEL: v_fshl_i128_ssv:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_movk_i32 s8, 0x7f
+; GFX6-NEXT:    v_and_b32_e32 v6, s8, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX6-NEXT:    v_and_b32_e32 v7, s8, v0
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v6
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[0:1], v0
+; GFX6-NEXT:    v_lshl_b64 v[2:3], s[2:3], v6
+; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 64, v6
+; GFX6-NEXT:    s_sub_i32 s10, 1, 64
+; GFX6-NEXT:    s_sub_i32 s8, 64, 1
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX6-NEXT:    v_lshl_b64 v[0:1], s[0:1], v8
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX6-NEXT:    v_mov_b32_e32 v2, s2
+; GFX6-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NEXT:    s_lshr_b64 s[2:3], s[4:5], 1
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
+; GFX6-NEXT:    v_lshl_b64 v[4:5], s[0:1], v6
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[6:7], 1
+; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[6:7], s10
+; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[6:7]
+; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v7
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[2:3], v7
+; GFX6-NEXT:    v_lshl_b64 v[2:3], s[0:1], v2
+; GFX6-NEXT:    v_subrev_i32_e32 v11, vcc, 64, v7
+; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[0:1], v11
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX6-NEXT:    v_lshr_b64 v[4:5], s[0:1], v7
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v2, s2
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
+; GFX6-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX6-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, v9, v1
+; GFX6-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshl_i128_ssv:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_movk_i32 s8, 0x7f
+; GFX8-NEXT:    v_and_b32_e32 v6, s8, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    v_and_b32_e32 v7, s8, v0
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v6
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, s[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v6, s[2:3]
+; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 64, v6
+; GFX8-NEXT:    s_sub_i32 s10, 1, 64
+; GFX8-NEXT:    s_sub_i32 s8, 64, 1
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, s[0:1]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    s_lshr_b64 s[2:3], s[4:5], 1
+; GFX8-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v6, s[0:1]
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[6:7], 1
+; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX8-NEXT:    s_lshr_b64 s[6:7], s[6:7], s10
+; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v7
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v7, s[2:3]
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, s[0:1]
+; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, 64, v7
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v11, s[0:1]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v7, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v9, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshl_i128_ssv:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_movk_i32 s8, 0x7f
+; GFX9-NEXT:    v_and_b32_e32 v6, s8, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT:    v_and_b32_e32 v7, s8, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v6
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, s[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v6, s[2:3]
+; GFX9-NEXT:    v_subrev_u32_e32 v8, 64, v6
+; GFX9-NEXT:    s_sub_i32 s10, 1, 64
+; GFX9-NEXT:    s_sub_i32 s8, 64, 1
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, s[0:1]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[4:5], 1
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v6, s[0:1]
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[6:7], 1
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX9-NEXT:    s_lshr_b64 s[6:7], s[6:7], s10
+; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v2, vcc
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], 0
+; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v7, s[2:3]
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, s[0:1]
+; GFX9-NEXT:    v_subrev_u32_e32 v11, 64, v7
+; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v11, s[0:1]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v7, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX9-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX9-NEXT:    v_or_b32_e32 v1, v9, v1
+; GFX9-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshl_i128_ssv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_movk_i32 s8, 0x7f
+; GFX10-NEXT:    s_sub_i32 s14, 1, 64
+; GFX10-NEXT:    v_and_b32_e32 v12, s8, v0
+; GFX10-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX10-NEXT:    s_sub_i32 s10, 64, 1
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 64, v12
+; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    v_and_b32_e32 v13, s8, v0
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v1, s[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v12, s[2:3]
+; GFX10-NEXT:    s_lshr_b64 s[8:9], s[4:5], 1
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[6:7], s10
+; GFX10-NEXT:    s_lshr_b64 s[12:13], s[6:7], 1
+; GFX10-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX10-NEXT:    s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 64, v13
+; GFX10-NEXT:    s_cselect_b64 s[6:7], s[4:5], s[6:7]
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v12
+; GFX10-NEXT:    s_cselect_b64 s[8:9], s[12:13], 0
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v14, 64, v13
+; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v13, s[6:7]
+; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v0, s[8:9]
+; GFX10-NEXT:    v_lshlrev_b64 v[15:16], v10, s[0:1]
+; GFX10-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v12
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v14, s[8:9]
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v12, s[0:1]
+; GFX10-NEXT:    v_or_b32_e32 v6, v6, v8
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 64, v13
+; GFX10-NEXT:    v_or_b32_e32 v7, v7, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v15, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v16, v3, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v13, s[8:9]
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 0, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, 0, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v8, s2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s6, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s7, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, s3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
+; GFX10-NEXT:    v_or_b32_e32 v0, v11, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, v15, v1
+; GFX10-NEXT:    v_or_b32_e32 v2, v19, v2
+; GFX10-NEXT:    v_or_b32_e32 v3, v6, v3
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
+  %cast.result = bitcast i128 %result to <4 x float>
+  ret <4 x float> %cast.result
+}
+
+define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) {
+; GFX6-LABEL: v_fshl_i128_svs:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_movk_i32 s6, 0x7f
+; GFX6-NEXT:    s_mov_b32 s7, 0
+; GFX6-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX6-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX6-NEXT:    s_sub_i32 s5, s8, 64
+; GFX6-NEXT:    s_sub_i32 s9, 64, s8
+; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX6-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX6-NEXT:    s_lshl_b64 s[6:7], s[0:1], s8
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[0:1], s9
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GFX6-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
+; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX6-NEXT:    s_sub_i32 s1, 64, 1
+; GFX6-NEXT:    s_sub_i32 s0, 1, 64
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], 1
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s1
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], 1
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], s0
+; GFX6-NEXT:    s_and_b32 s0, 1, s5
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX6-NEXT:    s_and_b32 s0, 1, s8
+; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
+; GFX6-NEXT:    s_sub_i32 s0, s4, 64
+; GFX6-NEXT:    s_sub_i32 s1, 64, s4
+; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], s4
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s1
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], s4
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], s0
+; GFX6-NEXT:    s_and_b32 s0, 1, s5
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX6-NEXT:    s_and_b32 s0, 1, s8
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX6-NEXT:    s_and_b32 s0, 1, s5
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX6-NEXT:    v_or_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, s7, v1
+; GFX6-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshl_i128_svs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_movk_i32 s6, 0x7f
+; GFX8-NEXT:    s_mov_b32 s7, 0
+; GFX8-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX8-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX8-NEXT:    s_sub_i32 s5, s8, 64
+; GFX8-NEXT:    s_sub_i32 s9, 64, s8
+; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX8-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX8-NEXT:    s_lshl_b64 s[6:7], s[0:1], s8
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[0:1], s9
+; GFX8-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GFX8-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
+; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX8-NEXT:    s_sub_i32 s1, 64, 1
+; GFX8-NEXT:    s_sub_i32 s0, 1, 64
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], 1, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], 1, v[2:3]
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
+; GFX8-NEXT:    s_and_b32 s0, 1, s5
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX8-NEXT:    s_and_b32 s0, 1, s8
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
+; GFX8-NEXT:    s_sub_i32 s0, s4, 64
+; GFX8-NEXT:    s_sub_i32 s1, 64, s4
+; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s4, v[2:3]
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
+; GFX8-NEXT:    s_and_b32 s0, 1, s5
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8-NEXT:    s_and_b32 s0, 1, s8
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX8-NEXT:    s_and_b32 s0, 1, s5
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX8-NEXT:    v_or_b32_e32 v0, s6, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, s7, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshl_i128_svs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_movk_i32 s6, 0x7f
+; GFX9-NEXT:    s_mov_b32 s7, 0
+; GFX9-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX9-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX9-NEXT:    s_sub_i32 s5, s8, 64
+; GFX9-NEXT:    s_sub_i32 s9, 64, s8
+; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX9-NEXT:    s_lshl_b64 s[6:7], s[0:1], s8
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[0:1], s9
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GFX9-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
+; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    s_sub_i32 s1, 64, 1
+; GFX9-NEXT:    s_sub_i32 s0, 1, 64
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], 1, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], 1, v[2:3]
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
+; GFX9-NEXT:    s_and_b32 s0, 1, s5
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX9-NEXT:    s_and_b32 s0, 1, s8
+; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
+; GFX9-NEXT:    s_sub_i32 s0, s4, 64
+; GFX9-NEXT:    s_sub_i32 s1, 64, s4
+; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s4, v[2:3]
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
+; GFX9-NEXT:    s_and_b32 s0, 1, s5
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9-NEXT:    s_and_b32 s0, 1, s8
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX9-NEXT:    s_and_b32 s0, 1, s5
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX9-NEXT:    v_or_b32_e32 v0, s6, v0
+; GFX9-NEXT:    v_or_b32_e32 v1, s7, v1
+; GFX9-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshl_i128_svs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_movk_i32 s6, 0x7f
+; GFX10-NEXT:    s_mov_b32 s7, 0
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], 1, v[0:1]
+; GFX10-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX10-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-NEXT:    s_sub_i32 s5, s8, 64
+; GFX10-NEXT:    s_sub_i32 s6, 64, s8
+; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[2:3], s8
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
+; GFX10-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
+; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX10-NEXT:    s_sub_i32 s0, 64, 1
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s0, v[2:3]
+; GFX10-NEXT:    s_sub_i32 s0, 1, 64
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    v_lshrrev_b64 v[11:12], s0, v[2:3]
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-NEXT:    s_and_b32 s1, 1, vcc_lo
+; GFX10-NEXT:    s_and_b32 s0, 1, s0
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v11, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v12, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
+; GFX10-NEXT:    s_sub_i32 s0, 64, s4
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s0, v[2:3]
+; GFX10-NEXT:    s_sub_i32 s0, s4, 64
+; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX10-NEXT:    v_lshrrev_b64 v[11:12], s0, v[2:3]
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-NEXT:    s_and_b32 s1, 1, vcc_lo
+; GFX10-NEXT:    s_and_b32 s0, 1, s0
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], s4, v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v11, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v12, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
+; GFX10-NEXT:    v_or_b32_e32 v0, s8, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, s9, v1
+; GFX10-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX10-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
+  %cast.result = bitcast i128 %result to <4 x float>
+  ret <4 x float> %cast.result
+}
+
+define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) {
+; GFX6-LABEL: v_fshl_i128_vss:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_movk_i32 s6, 0x7f
+; GFX6-NEXT:    s_mov_b32 s7, 0
+; GFX6-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX6-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX6-NEXT:    s_sub_i32 s6, 64, s8
+; GFX6-NEXT:    s_sub_i32 s5, s8, 64
+; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], s6
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s8
+; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    v_lshl_b64 v[8:9], v[0:1], s8
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], s5
+; GFX6-NEXT:    s_and_b32 s5, 1, s7
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX6-NEXT:    s_and_b32 s5, 1, s9
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX6-NEXT:    s_sub_i32 s5, 1, 64
+; GFX6-NEXT:    s_sub_i32 s10, 64, 1
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[0:1], 1
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[2:3], 1
+; GFX6-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], s5
+; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[8:9], s[2:3]
+; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
+; GFX6-NEXT:    s_sub_i32 s10, s4, 64
+; GFX6-NEXT:    s_sub_i32 s8, 64, s4
+; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[2:3], s4
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[0:1], s4
+; GFX6-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
+; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
+; GFX6-NEXT:    v_or_b32_e32 v0, s0, v6
+; GFX6-NEXT:    v_or_b32_e32 v1, s1, v7
+; GFX6-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshl_i128_vss:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_movk_i32 s6, 0x7f
+; GFX8-NEXT:    s_mov_b32 s7, 0
+; GFX8-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX8-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX8-NEXT:    s_sub_i32 s6, 64, s8
+; GFX8-NEXT:    s_sub_i32 s5, s8, 64
+; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s8, v[2:3]
+; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX8-NEXT:    v_lshlrev_b64 v[8:9], s8, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
+; GFX8-NEXT:    s_and_b32 s5, 1, s7
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8-NEXT:    s_and_b32 s5, 1, s9
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX8-NEXT:    s_sub_i32 s5, 1, 64
+; GFX8-NEXT:    s_sub_i32 s10, 64, 1
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[8:9], s[0:1], 1
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX8-NEXT:    s_lshr_b64 s[6:7], s[2:3], 1
+; GFX8-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], s5
+; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[8:9], s[2:3]
+; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
+; GFX8-NEXT:    s_sub_i32 s10, s4, 64
+; GFX8-NEXT:    s_sub_i32 s8, 64, s4
+; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[6:7], s[2:3], s4
+; GFX8-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GFX8-NEXT:    s_lshr_b64 s[4:5], s[0:1], s4
+; GFX8-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
+; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v6
+; GFX8-NEXT:    v_or_b32_e32 v1, s1, v7
+; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshl_i128_vss:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_movk_i32 s6, 0x7f
+; GFX9-NEXT:    s_mov_b32 s7, 0
+; GFX9-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX9-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX9-NEXT:    s_sub_i32 s6, 64, s8
+; GFX9-NEXT:    s_sub_i32 s5, s8, 64
+; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s8, v[2:3]
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    v_lshlrev_b64 v[8:9], s8, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
+; GFX9-NEXT:    s_and_b32 s5, 1, s7
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9-NEXT:    s_and_b32 s5, 1, s9
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX9-NEXT:    s_sub_i32 s5, 1, 64
+; GFX9-NEXT:    s_sub_i32 s10, 64, 1
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[0:1], 1
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX9-NEXT:    s_lshr_b64 s[6:7], s[2:3], 1
+; GFX9-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], s5
+; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[8:9], s[2:3]
+; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
+; GFX9-NEXT:    s_sub_i32 s10, s4, 64
+; GFX9-NEXT:    s_sub_i32 s8, 64, s4
+; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[6:7], s[2:3], s4
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[0:1], s4
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
+; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v6
+; GFX9-NEXT:    v_or_b32_e32 v1, s1, v7
+; GFX9-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshl_i128_vss:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_movk_i32 s6, 0x7f
+; GFX10-NEXT:    s_mov_b32 s7, 0
+; GFX10-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX10-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-NEXT:    s_sub_i32 s9, 64, s8
+; GFX10-NEXT:    s_sub_i32 s5, s8, 64
+; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s9, v[0:1]
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s8, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s8, v[0:1]
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-NEXT:    v_lshlrev_b64 v[11:12], s5, v[0:1]
+; GFX10-NEXT:    s_and_b32 s5, 1, vcc_lo
+; GFX10-NEXT:    s_and_b32 s12, 1, s6
+; GFX10-NEXT:    s_sub_i32 s13, 1, 64
+; GFX10-NEXT:    s_sub_i32 s8, 64, 1
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], 1
+; GFX10-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[2:3], 1
+; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], s13
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v11, v4, vcc_lo
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v5, vcc_lo
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s12
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[10:11], 0
+; GFX10-NEXT:    s_sub_i32 s10, s4, 64
+; GFX10-NEXT:    s_sub_i32 s5, 64, s4
+; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
+; GFX10-NEXT:    s_lshl_b64 s[8:9], s[2:3], s5
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[2:3], s4
+; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v6
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
+; GFX10-NEXT:    v_or_b32_e32 v1, s1, v7
+; GFX10-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX10-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
+  %cast.result = bitcast i128 %result to <4 x float>
+  ret <4 x float> %cast.result
+}
+
+define amdgpu_ps i128 @s_fshl_i128_65(i128 inreg %lhs, i128 inreg %rhs) {
+; GFX6-LABEL: s_fshl_i128_65:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_movk_i32 s8, 0x41
+; GFX6-NEXT:    s_sub_i32 s16, s8, 64
+; GFX6-NEXT:    s_sub_i32 s12, 64, s8
+; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX6-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[12:13], s[0:1], s12
+; GFX6-NEXT:    s_lshl_b64 s[14:15], s[2:3], s8
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[0:1], s8
+; GFX6-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s16
+; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX6-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[12:13], s[0:1]
+; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX6-NEXT:    s_sub_i32 s14, 63, 64
+; GFX6-NEXT:    s_sub_i32 s12, 64, 63
+; GFX6-NEXT:    s_cmp_lt_u32 63, 64
+; GFX6-NEXT:    s_mov_b32 s9, 0
+; GFX6-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 63, 0
+; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX6-NEXT:    s_lshr_b32 s0, s5, 31
+; GFX6-NEXT:    s_mov_b32 s1, s9
+; GFX6-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
+; GFX6-NEXT:    s_lshr_b32 s8, s7, 31
+; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
+; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[8:9], 0
+; GFX6-NEXT:    s_or_b64 s[0:1], s[10:11], s[0:1]
+; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_i128_65:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_movk_i32 s8, 0x41
+; GFX8-NEXT:    s_sub_i32 s16, s8, 64
+; GFX8-NEXT:    s_sub_i32 s12, 64, s8
+; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[12:13], s[0:1], s12
+; GFX8-NEXT:    s_lshl_b64 s[14:15], s[2:3], s8
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[0:1], s8
+; GFX8-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s16
+; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX8-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[12:13], s[0:1]
+; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX8-NEXT:    s_sub_i32 s14, 63, 64
+; GFX8-NEXT:    s_sub_i32 s12, 64, 63
+; GFX8-NEXT:    s_cmp_lt_u32 63, 64
+; GFX8-NEXT:    s_mov_b32 s9, 0
+; GFX8-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 63, 0
+; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX8-NEXT:    s_lshr_b32 s0, s5, 31
+; GFX8-NEXT:    s_mov_b32 s1, s9
+; GFX8-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
+; GFX8-NEXT:    s_lshr_b32 s8, s7, 31
+; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
+; GFX8-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
+; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[8:9], 0
+; GFX8-NEXT:    s_or_b64 s[0:1], s[10:11], s[0:1]
+; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_i128_65:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_movk_i32 s8, 0x41
+; GFX9-NEXT:    s_sub_i32 s16, s8, 64
+; GFX9-NEXT:    s_sub_i32 s12, 64, s8
+; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[12:13], s[0:1], s12
+; GFX9-NEXT:    s_lshl_b64 s[14:15], s[2:3], s8
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[0:1], s8
+; GFX9-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s16
+; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX9-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[12:13], s[0:1]
+; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    s_sub_i32 s14, 63, 64
+; GFX9-NEXT:    s_sub_i32 s12, 64, 63
+; GFX9-NEXT:    s_cmp_lt_u32 63, 64
+; GFX9-NEXT:    s_mov_b32 s9, 0
+; GFX9-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 63, 0
+; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX9-NEXT:    s_lshr_b32 s0, s5, 31
+; GFX9-NEXT:    s_mov_b32 s1, s9
+; GFX9-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
+; GFX9-NEXT:    s_lshr_b32 s8, s7, 31
+; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
+; GFX9-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
+; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[8:9], 0
+; GFX9-NEXT:    s_or_b64 s[0:1], s[10:11], s[0:1]
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_i128_65:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_movk_i32 s12, 0x41
+; GFX10-NEXT:    s_sub_i32 s14, s12, 64
+; GFX10-NEXT:    s_sub_i32 s8, 64, s12
+; GFX10-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[2:3], s12
+; GFX10-NEXT:    s_lshr_b64 s[8:9], s[0:1], s8
+; GFX10-NEXT:    s_lshl_b64 s[12:13], s[0:1], s12
+; GFX10-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s14
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_cselect_b64 s[10:11], s[12:13], 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX10-NEXT:    s_sub_i32 s14, 63, 64
+; GFX10-NEXT:    s_sub_i32 s0, 64, 63
+; GFX10-NEXT:    s_cmp_lt_u32 63, 64
+; GFX10-NEXT:    s_mov_b32 s1, 0
+; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 63, 0
+; GFX10-NEXT:    s_mov_b32 s9, s1
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_lshl_b64 s[12:13], s[6:7], s0
+; GFX10-NEXT:    s_lshr_b32 s8, s5, 31
+; GFX10-NEXT:    s_lshr_b32 s0, s7, 31
+; GFX10-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_cselect_b64 s[6:7], s[0:1], 0
+; GFX10-NEXT:    s_or_b64 s[0:1], s[10:11], s[4:5]
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65)
+  ret i128 %result
+}
+
+define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
+; GFX6-LABEL: v_fshl_i128_65:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_movk_i32 s4, 0x41
+; GFX6-NEXT:    s_sub_i32 s6, 64, s4
+; GFX6-NEXT:    s_sub_i32 s5, s4, 64
+; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[0:1], s6
+; GFX6-NEXT:    v_lshl_b64 v[10:11], v[2:3], s4
+; GFX6-NEXT:    v_lshl_b64 v[12:13], v[0:1], s4
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], s5
+; GFX6-NEXT:    s_and_b32 s4, 1, s7
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX6-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX6-NEXT:    s_and_b32 s4, 1, s8
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v10, 0, v12, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v11, 0, v13, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    s_sub_i32 s4, 63, 64
+; GFX6-NEXT:    s_sub_i32 s5, 64, 63
+; GFX6-NEXT:    s_cmp_lt_u32 63, 64
+; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 63, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[6:7], s5
+; GFX6-NEXT:    v_lshrrev_b32_e32 v8, 31, v7
+; GFX6-NEXT:    v_lshrrev_b32_e32 v9, 31, v5
+; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX6-NEXT:    v_lshr_b64 v[6:7], v[6:7], s4
+; GFX6-NEXT:    s_and_b32 s4, 1, s6
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX6-NEXT:    s_and_b32 s4, 1, s7
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    s_and_b32 s4, 1, s6
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX6-NEXT:    v_or_b32_e32 v0, v10, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, v11, v1
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i128_65:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_movk_i32 s4, 0x41
+; GFX8-NEXT:    s_sub_i32 s6, 64, s4
+; GFX8-NEXT:    s_sub_i32 s5, s4, 64
+; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s6, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[10:11], s4, v[2:3]
+; GFX8-NEXT:    v_lshlrev_b64 v[12:13], s4, v[0:1]
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
+; GFX8-NEXT:    s_and_b32 s4, 1, s7
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX8-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX8-NEXT:    s_and_b32 s4, 1, s8
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, 0, v12, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, 0, v13, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    s_sub_i32 s4, 63, 64
+; GFX8-NEXT:    s_sub_i32 s5, 64, 63
+; GFX8-NEXT:    s_cmp_lt_u32 63, 64
+; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 63, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s5, v[6:7]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 31, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 31, v5
+; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[6:7], s4, v[6:7]
+; GFX8-NEXT:    s_and_b32 s4, 1, s6
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX8-NEXT:    s_and_b32 s4, 1, s7
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    s_and_b32 s4, 1, s6
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX8-NEXT:    v_or_b32_e32 v0, v10, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v11, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i128_65:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0x41
+; GFX9-NEXT:    s_sub_i32 s6, 64, s4
+; GFX9-NEXT:    s_sub_i32 s5, s4, 64
+; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s6, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[10:11], s4, v[2:3]
+; GFX9-NEXT:    v_lshlrev_b64 v[12:13], s4, v[0:1]
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
+; GFX9-NEXT:    s_and_b32 s4, 1, s7
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX9-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX9-NEXT:    s_and_b32 s4, 1, s8
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, 0, v12, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, 0, v13, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    s_sub_i32 s4, 63, 64
+; GFX9-NEXT:    s_sub_i32 s5, 64, 63
+; GFX9-NEXT:    s_cmp_lt_u32 63, 64
+; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 63, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s5, v[6:7]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 31, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 31, v5
+; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[6:7], s4, v[6:7]
+; GFX9-NEXT:    s_and_b32 s4, 1, s6
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX9-NEXT:    s_and_b32 s4, 1, s7
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    s_and_b32 s4, 1, s6
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX9-NEXT:    v_or_b32_e32 v0, v10, v0
+; GFX9-NEXT:    v_or_b32_e32 v1, v11, v1
+; GFX9-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i128_65:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_movk_i32 s4, 0x41
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 31, v5
+; GFX10-NEXT:    s_sub_i32 s5, 64, s4
+; GFX10-NEXT:    v_lshlrev_b64 v[10:11], s4, v[2:3]
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s5, v[0:1]
+; GFX10-NEXT:    s_sub_i32 s5, s4, 64
+; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX10-NEXT:    v_lshlrev_b64 v[12:13], s4, v[0:1]
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
+; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    s_sub_i32 s5, 64, 63
+; GFX10-NEXT:    v_or_b32_e32 v15, v9, v11
+; GFX10-NEXT:    v_or_b32_e32 v14, v8, v10
+; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s5, v[6:7]
+; GFX10-NEXT:    s_and_b32 s6, 1, vcc_lo
+; GFX10-NEXT:    s_and_b32 s7, 1, s4
+; GFX10-NEXT:    s_sub_i32 s4, 63, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v27, 0, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
+; GFX10-NEXT:    v_lshrrev_b64 v[23:24], s4, v[6:7]
+; GFX10-NEXT:    s_cmp_lt_u32 63, 64
+; GFX10-NEXT:    v_or_b32_e32 v6, v19, v8
+; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 63, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s6, 0, s7
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v23, v6, s4
+; GFX10-NEXT:    s_and_b32 s5, 1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v24, v9, s4
+; GFX10-NEXT:    s_and_b32 s4, 1, s4
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v0, v2, s6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 31, v7
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v1, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v10, v3, s6
+; GFX10-NEXT:    v_or_b32_e32 v0, v11, v4
+; GFX10-NEXT:    v_or_b32_e32 v1, v27, v5
+; GFX10-NEXT:    v_or_b32_e32 v2, v19, v6
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65)
+  ret i128 %result
+}
+
+define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
+; GFX6-LABEL: s_fshl_v2i128:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_movk_i32 s18, 0x7f
+; GFX6-NEXT:    s_mov_b32 s19, 0
+; GFX6-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
+; GFX6-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
+; GFX6-NEXT:    s_sub_i32 s17, s22, 64
+; GFX6-NEXT:    s_sub_i32 s23, 64, s22
+; GFX6-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX6-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s22, 0
+; GFX6-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX6-NEXT:    s_lshl_b64 s[24:25], s[0:1], s22
+; GFX6-NEXT:    s_lshr_b64 s[26:27], s[0:1], s23
+; GFX6-NEXT:    s_lshl_b64 s[22:23], s[2:3], s22
+; GFX6-NEXT:    s_or_b64 s[22:23], s[26:27], s[22:23]
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
+; GFX6-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX6-NEXT:    s_cselect_b64 s[24:25], s[24:25], 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX6-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX6-NEXT:    s_sub_i32 s28, 1, 64
+; GFX6-NEXT:    s_sub_i32 s29, 64, 1
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    s_cselect_b32 s30, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[22:23], s[8:9], 1
+; GFX6-NEXT:    s_lshl_b64 s[26:27], s[10:11], s29
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[10:11], 1
+; GFX6-NEXT:    s_or_b64 s[22:23], s[22:23], s[26:27]
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[10:11], s28
+; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX6-NEXT:    s_cselect_b64 s[10:11], s[22:23], s[10:11]
+; GFX6-NEXT:    s_cmp_lg_u32 s30, 0
+; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
+; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], 0
+; GFX6-NEXT:    s_sub_i32 s26, s16, 64
+; GFX6-NEXT:    s_sub_i32 s22, 64, s16
+; GFX6-NEXT:    s_cmp_lt_u32 s16, 64
+; GFX6-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s16, 0
+; GFX6-NEXT:    s_cselect_b32 s30, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[0:1], s16
+; GFX6-NEXT:    s_lshl_b64 s[22:23], s[0:1], s22
+; GFX6-NEXT:    s_lshr_b64 s[16:17], s[8:9], s16
+; GFX6-NEXT:    s_or_b64 s[16:17], s[16:17], s[22:23]
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], s26
+; GFX6-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[16:17], s[0:1]
+; GFX6-NEXT:    s_cmp_lg_u32 s30, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX6-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX6-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
+; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX6-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
+; GFX6-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
+; GFX6-NEXT:    s_or_b64 s[0:1], s[24:25], s[0:1]
+; GFX6-NEXT:    s_sub_i32 s11, s8, 64
+; GFX6-NEXT:    s_sub_i32 s9, 64, s8
+; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX6-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX6-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX6-NEXT:    s_lshl_b64 s[16:17], s[4:5], s8
+; GFX6-NEXT:    s_lshr_b64 s[18:19], s[4:5], s9
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
+; GFX6-NEXT:    s_or_b64 s[8:9], s[18:19], s[8:9]
+; GFX6-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
+; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX6-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX6-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[12:13], 1
+; GFX6-NEXT:    s_lshl_b64 s[18:19], s[14:15], s29
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[14:15], 1
+; GFX6-NEXT:    s_or_b64 s[8:9], s[8:9], s[18:19]
+; GFX6-NEXT:    s_lshr_b64 s[14:15], s[14:15], s28
+; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[14:15]
+; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX6-NEXT:    s_cselect_b64 s[8:9], s[12:13], s[8:9]
+; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], 0
+; GFX6-NEXT:    s_sub_i32 s18, s10, 64
+; GFX6-NEXT:    s_sub_i32 s14, 64, s10
+; GFX6-NEXT:    s_cmp_lt_u32 s10, 64
+; GFX6-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s10, 0
+; GFX6-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[12:13], s[4:5], s10
+; GFX6-NEXT:    s_lshl_b64 s[14:15], s[4:5], s14
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[8:9], s10
+; GFX6-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], s18
+; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[10:11], s[4:5]
+; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_cselect_b64 s[8:9], s[12:13], 0
+; GFX6-NEXT:    s_or_b64 s[4:5], s[16:17], s[4:5]
+; GFX6-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_v2i128:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_movk_i32 s18, 0x7f
+; GFX8-NEXT:    s_mov_b32 s19, 0
+; GFX8-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
+; GFX8-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
+; GFX8-NEXT:    s_sub_i32 s17, s22, 64
+; GFX8-NEXT:    s_sub_i32 s23, 64, s22
+; GFX8-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s22, 0
+; GFX8-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX8-NEXT:    s_lshl_b64 s[24:25], s[0:1], s22
+; GFX8-NEXT:    s_lshr_b64 s[26:27], s[0:1], s23
+; GFX8-NEXT:    s_lshl_b64 s[22:23], s[2:3], s22
+; GFX8-NEXT:    s_or_b64 s[22:23], s[26:27], s[22:23]
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
+; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX8-NEXT:    s_cselect_b64 s[24:25], s[24:25], 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX8-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX8-NEXT:    s_sub_i32 s28, 1, 64
+; GFX8-NEXT:    s_sub_i32 s29, 64, 1
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    s_cselect_b32 s30, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[22:23], s[8:9], 1
+; GFX8-NEXT:    s_lshl_b64 s[26:27], s[10:11], s29
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[10:11], 1
+; GFX8-NEXT:    s_or_b64 s[22:23], s[22:23], s[26:27]
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[10:11], s28
+; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX8-NEXT:    s_cselect_b64 s[10:11], s[22:23], s[10:11]
+; GFX8-NEXT:    s_cmp_lg_u32 s30, 0
+; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
+; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], 0
+; GFX8-NEXT:    s_sub_i32 s26, s16, 64
+; GFX8-NEXT:    s_sub_i32 s22, 64, s16
+; GFX8-NEXT:    s_cmp_lt_u32 s16, 64
+; GFX8-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s16, 0
+; GFX8-NEXT:    s_cselect_b32 s30, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[0:1], s16
+; GFX8-NEXT:    s_lshl_b64 s[22:23], s[0:1], s22
+; GFX8-NEXT:    s_lshr_b64 s[16:17], s[8:9], s16
+; GFX8-NEXT:    s_or_b64 s[16:17], s[16:17], s[22:23]
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], s26
+; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[16:17], s[0:1]
+; GFX8-NEXT:    s_cmp_lg_u32 s30, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX8-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
+; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX8-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
+; GFX8-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
+; GFX8-NEXT:    s_or_b64 s[0:1], s[24:25], s[0:1]
+; GFX8-NEXT:    s_sub_i32 s11, s8, 64
+; GFX8-NEXT:    s_sub_i32 s9, 64, s8
+; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX8-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX8-NEXT:    s_lshl_b64 s[16:17], s[4:5], s8
+; GFX8-NEXT:    s_lshr_b64 s[18:19], s[4:5], s9
+; GFX8-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
+; GFX8-NEXT:    s_or_b64 s[8:9], s[18:19], s[8:9]
+; GFX8-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
+; GFX8-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX8-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[8:9], s[12:13], 1
+; GFX8-NEXT:    s_lshl_b64 s[18:19], s[14:15], s29
+; GFX8-NEXT:    s_lshr_b64 s[4:5], s[14:15], 1
+; GFX8-NEXT:    s_or_b64 s[8:9], s[8:9], s[18:19]
+; GFX8-NEXT:    s_lshr_b64 s[14:15], s[14:15], s28
+; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[14:15]
+; GFX8-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX8-NEXT:    s_cselect_b64 s[8:9], s[12:13], s[8:9]
+; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], 0
+; GFX8-NEXT:    s_sub_i32 s18, s10, 64
+; GFX8-NEXT:    s_sub_i32 s14, 64, s10
+; GFX8-NEXT:    s_cmp_lt_u32 s10, 64
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s10, 0
+; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[12:13], s[4:5], s10
+; GFX8-NEXT:    s_lshl_b64 s[14:15], s[4:5], s14
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[8:9], s10
+; GFX8-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
+; GFX8-NEXT:    s_lshr_b64 s[4:5], s[4:5], s18
+; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[10:11], s[4:5]
+; GFX8-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_cselect_b64 s[8:9], s[12:13], 0
+; GFX8-NEXT:    s_or_b64 s[4:5], s[16:17], s[4:5]
+; GFX8-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_v2i128:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_movk_i32 s18, 0x7f
+; GFX9-NEXT:    s_mov_b32 s19, 0
+; GFX9-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
+; GFX9-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
+; GFX9-NEXT:    s_sub_i32 s17, s22, 64
+; GFX9-NEXT:    s_sub_i32 s23, 64, s22
+; GFX9-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s22, 0
+; GFX9-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX9-NEXT:    s_lshl_b64 s[24:25], s[0:1], s22
+; GFX9-NEXT:    s_lshr_b64 s[26:27], s[0:1], s23
+; GFX9-NEXT:    s_lshl_b64 s[22:23], s[2:3], s22
+; GFX9-NEXT:    s_or_b64 s[22:23], s[26:27], s[22:23]
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
+; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX9-NEXT:    s_cselect_b64 s[24:25], s[24:25], 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX9-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    s_sub_i32 s28, 1, 64
+; GFX9-NEXT:    s_sub_i32 s29, 64, 1
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    s_cselect_b32 s30, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[22:23], s[8:9], 1
+; GFX9-NEXT:    s_lshl_b64 s[26:27], s[10:11], s29
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[10:11], 1
+; GFX9-NEXT:    s_or_b64 s[22:23], s[22:23], s[26:27]
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[10:11], s28
+; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX9-NEXT:    s_cselect_b64 s[10:11], s[22:23], s[10:11]
+; GFX9-NEXT:    s_cmp_lg_u32 s30, 0
+; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
+; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], 0
+; GFX9-NEXT:    s_sub_i32 s26, s16, 64
+; GFX9-NEXT:    s_sub_i32 s22, 64, s16
+; GFX9-NEXT:    s_cmp_lt_u32 s16, 64
+; GFX9-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s16, 0
+; GFX9-NEXT:    s_cselect_b32 s30, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[0:1], s16
+; GFX9-NEXT:    s_lshl_b64 s[22:23], s[0:1], s22
+; GFX9-NEXT:    s_lshr_b64 s[16:17], s[8:9], s16
+; GFX9-NEXT:    s_or_b64 s[16:17], s[16:17], s[22:23]
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], s26
+; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[16:17], s[0:1]
+; GFX9-NEXT:    s_cmp_lg_u32 s30, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX9-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX9-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
+; GFX9-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
+; GFX9-NEXT:    s_or_b64 s[0:1], s[24:25], s[0:1]
+; GFX9-NEXT:    s_sub_i32 s11, s8, 64
+; GFX9-NEXT:    s_sub_i32 s9, 64, s8
+; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX9-NEXT:    s_lshl_b64 s[16:17], s[4:5], s8
+; GFX9-NEXT:    s_lshr_b64 s[18:19], s[4:5], s9
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
+; GFX9-NEXT:    s_or_b64 s[8:9], s[18:19], s[8:9]
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
+; GFX9-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX9-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[12:13], 1
+; GFX9-NEXT:    s_lshl_b64 s[18:19], s[14:15], s29
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[14:15], 1
+; GFX9-NEXT:    s_or_b64 s[8:9], s[8:9], s[18:19]
+; GFX9-NEXT:    s_lshr_b64 s[14:15], s[14:15], s28
+; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[14:15]
+; GFX9-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX9-NEXT:    s_cselect_b64 s[8:9], s[12:13], s[8:9]
+; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], 0
+; GFX9-NEXT:    s_sub_i32 s18, s10, 64
+; GFX9-NEXT:    s_sub_i32 s14, 64, s10
+; GFX9-NEXT:    s_cmp_lt_u32 s10, 64
+; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s10, 0
+; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[12:13], s[4:5], s10
+; GFX9-NEXT:    s_lshl_b64 s[14:15], s[4:5], s14
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[8:9], s10
+; GFX9-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s18
+; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[10:11], s[4:5]
+; GFX9-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_cselect_b64 s[8:9], s[12:13], 0
+; GFX9-NEXT:    s_or_b64 s[4:5], s[16:17], s[4:5]
+; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_v2i128:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_movk_i32 s18, 0x7f
+; GFX10-NEXT:    s_mov_b32 s19, 0
+; GFX10-NEXT:    s_mov_b32 s30, s0
+; GFX10-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
+; GFX10-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
+; GFX10-NEXT:    s_sub_i32 s17, s22, 64
+; GFX10-NEXT:    s_sub_i32 s23, 64, s22
+; GFX10-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX10-NEXT:    s_mov_b32 s31, s1
+; GFX10-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s22, 0
+; GFX10-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[24:25], s[30:31], s23
+; GFX10-NEXT:    s_lshl_b64 s[26:27], s[2:3], s22
+; GFX10-NEXT:    s_lshl_b64 s[22:23], s[30:31], s22
+; GFX10-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[30:31], s17
+; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX10-NEXT:    s_cselect_b64 s[22:23], s[22:23], 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX10-NEXT:    s_sub_i32 s28, 1, 64
+; GFX10-NEXT:    s_sub_i32 s29, 64, 1
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    s_cselect_b32 s30, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
+; GFX10-NEXT:    s_lshl_b64 s[24:25], s[10:11], s29
+; GFX10-NEXT:    s_lshr_b64 s[26:27], s[10:11], 1
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[24:25]
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[10:11], s28
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
+; GFX10-NEXT:    s_cmp_lg_u32 s30, 0
+; GFX10-NEXT:    s_cselect_b64 s[46:47], s[8:9], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    s_cselect_b64 s[8:9], s[26:27], 0
+; GFX10-NEXT:    s_sub_i32 s26, s16, 64
+; GFX10-NEXT:    s_sub_i32 s17, 64, s16
+; GFX10-NEXT:    s_cmp_lt_u32 s16, 64
+; GFX10-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b32 s30, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[46:47], s16
+; GFX10-NEXT:    s_lshl_b64 s[24:25], s[8:9], s17
+; GFX10-NEXT:    s_lshr_b64 s[16:17], s[8:9], s16
+; GFX10-NEXT:    s_or_b64 s[10:11], s[10:11], s[24:25]
+; GFX10-NEXT:    s_lshr_b64 s[8:9], s[8:9], s26
+; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX10-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
+; GFX10-NEXT:    s_cmp_lg_u32 s30, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[46:47], s[8:9]
+; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX10-NEXT:    s_cselect_b64 s[8:9], s[16:17], 0
+; GFX10-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX10-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
+; GFX10-NEXT:    s_or_b64 s[0:1], s[22:23], s[0:1]
+; GFX10-NEXT:    s_sub_i32 s11, s8, 64
+; GFX10-NEXT:    s_sub_i32 s9, 64, s8
+; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[16:17], s[4:5], s9
+; GFX10-NEXT:    s_lshl_b64 s[18:19], s[6:7], s8
+; GFX10-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
+; GFX10-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX10-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
+; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX10-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[16:17], s[4:5]
+; GFX10-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX10-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
+; GFX10-NEXT:    s_lshl_b64 s[16:17], s[14:15], s29
+; GFX10-NEXT:    s_lshr_b64 s[18:19], s[14:15], 1
+; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[16:17]
+; GFX10-NEXT:    s_lshr_b64 s[14:15], s[14:15], s28
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[14:15]
+; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cselect_b64 s[12:13], s[18:19], 0
+; GFX10-NEXT:    s_sub_i32 s18, s10, 64
+; GFX10-NEXT:    s_sub_i32 s11, 64, s10
+; GFX10-NEXT:    s_cmp_lt_u32 s10, 64
+; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s10, 0
+; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[14:15], s[4:5], s10
+; GFX10-NEXT:    s_lshl_b64 s[16:17], s[12:13], s11
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[12:13], s10
+; GFX10-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX10-NEXT:    s_lshr_b64 s[12:13], s[12:13], s18
+; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX10-NEXT:    s_cselect_b64 s[12:13], s[14:15], s[12:13]
+; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[12:13]
+; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX10-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
+; GFX10-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
+  ret <2 x i128> %result
+}
+
+define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) {
+; GFX6-LABEL: v_fshl_v2i128:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_movk_i32 s6, 0x7f
+; GFX6-NEXT:    v_and_b32_e32 v23, s6, v16
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, 64, v23
+; GFX6-NEXT:    v_lshr_b64 v[17:18], v[0:1], v17
+; GFX6-NEXT:    v_lshl_b64 v[21:22], v[2:3], v23
+; GFX6-NEXT:    s_sub_i32 s7, 64, 1
+; GFX6-NEXT:    s_sub_i32 s8, 1, 64
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    v_or_b32_e32 v24, v17, v21
+; GFX6-NEXT:    v_or_b32_e32 v25, v18, v22
+; GFX6-NEXT:    v_lshr_b64 v[17:18], v[8:9], 1
+; GFX6-NEXT:    v_lshl_b64 v[21:22], v[10:11], s7
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    v_or_b32_e32 v19, v17, v21
+; GFX6-NEXT:    v_or_b32_e32 v21, v18, v22
+; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, 1, s4
+; GFX6-NEXT:    v_lshr_b64 v[17:18], v[10:11], s8
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    s_and_b32 s4, 1, s5
+; GFX6-NEXT:    v_lshr_b64 v[10:11], v[10:11], 1
+; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v16
+; GFX6-NEXT:    v_cndmask_b32_e32 v18, v18, v21, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
+; GFX6-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc
+; GFX6-NEXT:    v_and_b32_e32 v21, s6, v16
+; GFX6-NEXT:    v_cndmask_b32_e64 v8, v17, v8, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v9, v18, v9, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v11, 0, v11, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 64, v21
+; GFX6-NEXT:    v_lshl_b64 v[16:17], v[10:11], v16
+; GFX6-NEXT:    v_lshr_b64 v[18:19], v[8:9], v21
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v23
+; GFX6-NEXT:    v_or_b32_e32 v18, v18, v16
+; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, 64, v23
+; GFX6-NEXT:    v_or_b32_e32 v19, v19, v17
+; GFX6-NEXT:    v_lshl_b64 v[16:17], v[0:1], v16
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v23
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
+; GFX6-NEXT:    v_cndmask_b32_e32 v22, 0, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v16, v24, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v16, v17, v25, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v17, v0, v2, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v16, v16, v3, s[4:5]
+; GFX6-NEXT:    v_subrev_i32_e64 v0, s[4:5], 64, v21
+; GFX6-NEXT:    v_lshr_b64 v[2:3], v[10:11], v0
+; GFX6-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v21
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v18, 0, v1, vcc
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[10:11], v21
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v21
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, v1, s[4:5]
+; GFX6-NEXT:    v_or_b32_e32 v0, v22, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v18, v3
+; GFX6-NEXT:    v_or_b32_e32 v2, v17, v8
+; GFX6-NEXT:    v_or_b32_e32 v3, v16, v9
+; GFX6-NEXT:    v_and_b32_e32 v16, s6, v20
+; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v20
+; GFX6-NEXT:    v_and_b32_e32 v17, s6, v8
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v16
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[4:5], v8
+; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v16
+; GFX6-NEXT:    v_subrev_i32_e32 v18, vcc, 64, v16
+; GFX6-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX6-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX6-NEXT:    v_lshl_b64 v[8:9], v[4:5], v16
+; GFX6-NEXT:    v_lshl_b64 v[4:5], v[4:5], v18
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX6-NEXT:    v_cndmask_b32_e32 v18, 0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v19, 0, v9, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    v_cndmask_b32_e32 v16, v4, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v20, v5, v7, vcc
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[12:13], 1
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[14:15], s7
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX6-NEXT:    s_and_b32 s6, 1, s4
+; GFX6-NEXT:    v_or_b32_e32 v6, v4, v6
+; GFX6-NEXT:    v_or_b32_e32 v7, v5, v7
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[14:15], s8
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s6
+; GFX6-NEXT:    s_and_b32 s5, 1, s5
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[14:15], 1
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX6-NEXT:    s_and_b32 s4, 1, s4
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, 64, v17
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[4:5], v17
+; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v10
+; GFX6-NEXT:    v_subrev_i32_e32 v12, vcc, 64, v17
+; GFX6-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX6-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[6:7], v17
+; GFX6-NEXT:    v_lshr_b64 v[6:7], v[6:7], v12
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v17
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
+; GFX6-NEXT:    v_or_b32_e32 v4, v18, v4
+; GFX6-NEXT:    v_or_b32_e32 v5, v19, v5
+; GFX6-NEXT:    v_or_b32_e32 v6, v16, v6
+; GFX6-NEXT:    v_or_b32_e32 v7, v20, v7
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_v2i128:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_movk_i32 s6, 0x7f
+; GFX8-NEXT:    v_and_b32_e32 v23, s6, v16
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, 64, v23
+; GFX8-NEXT:    v_lshrrev_b64 v[17:18], v17, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[21:22], v23, v[2:3]
+; GFX8-NEXT:    s_sub_i32 s7, 64, 1
+; GFX8-NEXT:    s_sub_i32 s8, 1, 64
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    v_or_b32_e32 v24, v17, v21
+; GFX8-NEXT:    v_or_b32_e32 v25, v18, v22
+; GFX8-NEXT:    v_lshrrev_b64 v[17:18], 1, v[8:9]
+; GFX8-NEXT:    v_lshlrev_b64 v[21:22], s7, v[10:11]
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    v_or_b32_e32 v19, v17, v21
+; GFX8-NEXT:    v_or_b32_e32 v21, v18, v22
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, 1, s4
+; GFX8-NEXT:    v_lshrrev_b64 v[17:18], s8, v[10:11]
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    s_and_b32 s4, 1, s5
+; GFX8-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
+; GFX8-NEXT:    v_xor_b32_e32 v16, -1, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v21, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc
+; GFX8-NEXT:    v_and_b32_e32 v21, s6, v16
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v17, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v18, v9, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, 0, v11, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, 64, v21
+; GFX8-NEXT:    v_lshlrev_b64 v[16:17], v16, v[10:11]
+; GFX8-NEXT:    v_lshrrev_b64 v[18:19], v21, v[8:9]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v23
+; GFX8-NEXT:    v_or_b32_e32 v18, v18, v16
+; GFX8-NEXT:    v_subrev_u32_e32 v16, vcc, 64, v23
+; GFX8-NEXT:    v_or_b32_e32 v19, v19, v17
+; GFX8-NEXT:    v_lshlrev_b64 v[16:17], v16, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v23, v[0:1]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, 0, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v24, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v25, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, v0, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v16, v3, s[4:5]
+; GFX8-NEXT:    v_subrev_u32_e64 v0, s[4:5], 64, v21
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v0, v[10:11]
+; GFX8-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v21
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, 0, v1, vcc
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v21, v[10:11]
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v21
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, v1, s[4:5]
+; GFX8-NEXT:    v_or_b32_e32 v0, v22, v2
+; GFX8-NEXT:    v_or_b32_e32 v1, v18, v3
+; GFX8-NEXT:    v_or_b32_e32 v2, v17, v8
+; GFX8-NEXT:    v_or_b32_e32 v3, v16, v9
+; GFX8-NEXT:    v_and_b32_e32 v16, s6, v20
+; GFX8-NEXT:    v_xor_b32_e32 v8, -1, v20
+; GFX8-NEXT:    v_and_b32_e32 v17, s6, v8
+; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v16
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v8, v[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v16, v[6:7]
+; GFX8-NEXT:    v_subrev_u32_e32 v18, vcc, 64, v16
+; GFX8-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX8-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX8-NEXT:    v_lshlrev_b64 v[8:9], v16, v[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v18, v[4:5]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, 0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, 0, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v4, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v20, v5, v7, vcc
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], 1, v[12:13]
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s7, v[14:15]
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_and_b32 s6, 1, s4
+; GFX8-NEXT:    v_or_b32_e32 v6, v4, v6
+; GFX8-NEXT:    v_or_b32_e32 v7, v5, v7
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s8, v[14:15]
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s6
+; GFX8-NEXT:    s_and_b32 s5, 1, s5
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], 1, v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX8-NEXT:    s_and_b32 s4, 1, s4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, 64, v17
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v17, v[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v10, v[6:7]
+; GFX8-NEXT:    v_subrev_u32_e32 v12, vcc, 64, v17
+; GFX8-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX8-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v17, v[6:7]
+; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v12, v[6:7]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
+; GFX8-NEXT:    v_or_b32_e32 v4, v18, v4
+; GFX8-NEXT:    v_or_b32_e32 v5, v19, v5
+; GFX8-NEXT:    v_or_b32_e32 v6, v16, v6
+; GFX8-NEXT:    v_or_b32_e32 v7, v20, v7
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_v2i128:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s6, 0x7f
+; GFX9-NEXT:    v_and_b32_e32 v23, s6, v16
+; GFX9-NEXT:    v_sub_u32_e32 v17, 64, v23
+; GFX9-NEXT:    v_lshrrev_b64 v[17:18], v17, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[21:22], v23, v[2:3]
+; GFX9-NEXT:    s_sub_i32 s7, 64, 1
+; GFX9-NEXT:    s_sub_i32 s8, 1, 64
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    v_or_b32_e32 v24, v17, v21
+; GFX9-NEXT:    v_or_b32_e32 v25, v18, v22
+; GFX9-NEXT:    v_lshrrev_b64 v[17:18], 1, v[8:9]
+; GFX9-NEXT:    v_lshlrev_b64 v[21:22], s7, v[10:11]
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    v_or_b32_e32 v19, v17, v21
+; GFX9-NEXT:    v_or_b32_e32 v21, v18, v22
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, 1, s4
+; GFX9-NEXT:    v_lshrrev_b64 v[17:18], s8, v[10:11]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    s_and_b32 s4, 1, s5
+; GFX9-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
+; GFX9-NEXT:    v_xor_b32_e32 v16, -1, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v18, v21, vcc
+; GFX9-NEXT:    v_and_b32_e32 v21, s6, v16
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v17, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v18, v9, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, 0, v11, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v16, 64, v21
+; GFX9-NEXT:    v_lshlrev_b64 v[16:17], v16, v[10:11]
+; GFX9-NEXT:    v_lshrrev_b64 v[18:19], v21, v[8:9]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
+; GFX9-NEXT:    v_or_b32_e32 v18, v18, v16
+; GFX9-NEXT:    v_subrev_u32_e32 v16, 64, v23
+; GFX9-NEXT:    v_or_b32_e32 v19, v19, v17
+; GFX9-NEXT:    v_lshlrev_b64 v[16:17], v16, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v23, v[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v23
+; GFX9-NEXT:    v_cndmask_b32_e32 v22, 0, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v24, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v25, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v0, v2, s[4:5]
+; GFX9-NEXT:    v_subrev_u32_e32 v0, 64, v21
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v16, v3, s[4:5]
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v0, v[10:11]
+; GFX9-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v21
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, 0, v1, vcc
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v21, v[10:11]
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v21
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, v1, s[4:5]
+; GFX9-NEXT:    v_or_b32_e32 v0, v22, v2
+; GFX9-NEXT:    v_or_b32_e32 v1, v18, v3
+; GFX9-NEXT:    v_or_b32_e32 v2, v17, v8
+; GFX9-NEXT:    v_or_b32_e32 v3, v16, v9
+; GFX9-NEXT:    v_and_b32_e32 v16, s6, v20
+; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v20
+; GFX9-NEXT:    v_and_b32_e32 v17, s6, v8
+; GFX9-NEXT:    v_sub_u32_e32 v8, 64, v16
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v8, v[4:5]
+; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v16, v[6:7]
+; GFX9-NEXT:    v_subrev_u32_e32 v18, 64, v16
+; GFX9-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX9-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX9-NEXT:    v_lshlrev_b64 v[8:9], v16, v[4:5]
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v18, v[4:5]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, 0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v19, 0, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v4, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v20, v5, v7, vcc
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], 1, v[12:13]
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s7, v[14:15]
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_and_b32 s6, 1, s4
+; GFX9-NEXT:    v_or_b32_e32 v6, v4, v6
+; GFX9-NEXT:    v_or_b32_e32 v7, v5, v7
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s8, v[14:15]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s6
+; GFX9-NEXT:    s_and_b32 s5, 1, s5
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], 1, v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX9-NEXT:    s_and_b32 s4, 1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v10, 64, v17
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v17, v[4:5]
+; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v10, v[6:7]
+; GFX9-NEXT:    v_subrev_u32_e32 v12, 64, v17
+; GFX9-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX9-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v17, v[6:7]
+; GFX9-NEXT:    v_lshrrev_b64 v[6:7], v12, v[6:7]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
+; GFX9-NEXT:    v_or_b32_e32 v4, v18, v4
+; GFX9-NEXT:    v_or_b32_e32 v5, v19, v5
+; GFX9-NEXT:    v_or_b32_e32 v6, v16, v6
+; GFX9-NEXT:    v_or_b32_e32 v7, v20, v7
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_v2i128:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_xor_b32_e32 v17, -1, v16
+; GFX10-NEXT:    s_movk_i32 s7, 0x7f
+; GFX10-NEXT:    s_sub_i32 s8, 64, 1
+; GFX10-NEXT:    v_and_b32_e32 v27, s7, v16
+; GFX10-NEXT:    v_lshlrev_b64 v[18:19], s8, v[10:11]
+; GFX10-NEXT:    v_and_b32_e32 v28, s7, v17
+; GFX10-NEXT:    v_lshrrev_b64 v[16:17], 1, v[8:9]
+; GFX10-NEXT:    s_sub_i32 s9, 1, 64
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    v_lshrrev_b64 v[21:22], s9, v[10:11]
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    v_or_b32_e32 v16, v16, v18
+; GFX10-NEXT:    v_or_b32_e32 v17, v17, v19
+; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    s_and_b32 s5, 1, vcc_lo
+; GFX10-NEXT:    s_and_b32 s4, 1, s4
+; GFX10-NEXT:    v_mov_b32_e32 v29, v2
+; GFX10-NEXT:    v_mov_b32_e32 v30, v3
+; GFX10-NEXT:    v_sub_nc_u32_e32 v23, 64, v27
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v16, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s5
+; GFX10-NEXT:    v_lshrrev_b64 v[16:17], v23, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[18:19], v27, v[29:30]
+; GFX10-NEXT:    v_sub_nc_u32_e32 v25, 64, v28
+; GFX10-NEXT:    v_cndmask_b32_e32 v34, v21, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v35, v22, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, v10, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, v11, s4
+; GFX10-NEXT:    v_or_b32_e32 v18, v16, v18
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v31, 64, v27
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v16, 64, v28
+; GFX10-NEXT:    v_lshrrev_b64 v[23:24], v28, v[34:35]
+; GFX10-NEXT:    v_lshlrev_b64 v[25:26], v25, v[10:11]
+; GFX10-NEXT:    v_lshlrev_b64 v[21:22], v27, v[0:1]
+; GFX10-NEXT:    v_or_b32_e32 v19, v17, v19
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v31, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[16:17], v16, v[10:11]
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v27
+; GFX10-NEXT:    v_or_b32_e32 v23, v23, v25
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v28
+; GFX10-NEXT:    v_or_b32_e32 v24, v24, v26
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v28
+; GFX10-NEXT:    v_cndmask_b32_e32 v18, v0, v18, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v1, v19, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v28, v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v16, v23, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v17, v24, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v22, vcc_lo
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v16, v34, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v10, v35, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, 0, v1, s4
+; GFX10-NEXT:    v_xor_b32_e32 v16, -1, v20
+; GFX10-NEXT:    v_or_b32_e32 v0, v21, v8
+; GFX10-NEXT:    v_or_b32_e32 v1, v11, v9
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], 1, v[12:13]
+; GFX10-NEXT:    v_lshlrev_b64 v[10:11], s8, v[14:15]
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v27
+; GFX10-NEXT:    v_and_b32_e32 v27, s7, v16
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    v_lshrrev_b64 v[16:17], s9, v[14:15]
+; GFX10-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX10-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    s_and_b32 s5, 1, vcc_lo
+; GFX10-NEXT:    s_and_b32 s4, 1, s4
+; GFX10-NEXT:    v_and_b32_e32 v24, s7, v20
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], 1, v[14:15]
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v19, v30, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v16, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v31, v17, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v29, s6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v18, 64, v24
+; GFX10-NEXT:    v_lshlrev_b64 v[14:15], v24, v[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v31, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v19, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, v9, s4
+; GFX10-NEXT:    v_sub_nc_u32_e32 v31, 64, v27
+; GFX10-NEXT:    v_lshrrev_b64 v[35:36], v18, v[4:5]
+; GFX10-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 64, v24
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v27
+; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v24, v[4:5]
+; GFX10-NEXT:    v_lshrrev_b64 v[18:19], v27, v[12:13]
+; GFX10-NEXT:    v_lshlrev_b64 v[20:21], v31, v[8:9]
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v24
+; GFX10-NEXT:    v_lshlrev_b64 v[3:4], v3, v[4:5]
+; GFX10-NEXT:    v_or_b32_e32 v5, v36, v15
+; GFX10-NEXT:    v_or_b32_e32 v14, v35, v14
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v10, v[8:9]
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, 0, v16, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v16, v18, v20
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v27
+; GFX10-NEXT:    v_or_b32_e32 v18, v19, v21
+; GFX10-NEXT:    v_cndmask_b32_e32 v31, v3, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b64 v[3:4], v27, v[8:9]
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v24
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v11, v18, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v27
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v16, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0, v17, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v5, v7, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, v4, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v31, v6, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v10, v12, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v13, s5
+; GFX10-NEXT:    v_or_b32_e32 v3, v22, v23
+; GFX10-NEXT:    v_or_b32_e32 v7, v14, v11
+; GFX10-NEXT:    v_or_b32_e32 v4, v15, v5
+; GFX10-NEXT:    v_or_b32_e32 v6, v19, v10
+; GFX10-NEXT:    v_or_b32_e32 v5, v9, v8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
+  ret <2 x i128> %result
+}
+
+declare i7 @llvm.fshl.i7(i7, i7, i7) #0
+declare i8 @llvm.fshl.i8(i8, i8, i8) #0
+declare <2 x i8> @llvm.fshl.v2i8(<2 x i8>, <2 x i8>, <2 x i8>) #0
+declare <4 x i8> @llvm.fshl.v4i8(<4 x i8>, <4 x i8>, <4 x i8>) #0
+
+declare i16 @llvm.fshl.i16(i16, i16, i16) #0
+declare <2 x i16> @llvm.fshl.v2i16(<2 x i16>, <2 x i16>, <2 x i16>) #0
+declare <3 x i16> @llvm.fshl.v3i16(<3 x i16>, <3 x i16>, <3 x i16>) #0
+declare <4 x i16> @llvm.fshl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) #0
+declare <5 x i16> @llvm.fshl.v5i16(<5 x i16>, <5 x i16>, <5 x i16>) #0
+declare <6 x i16> @llvm.fshl.v6i16(<6 x i16>, <6 x i16>, <6 x i16>) #0
+declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) #0
+
+declare i24 @llvm.fshl.i24(i24, i24, i24) #0
+declare <2 x i24> @llvm.fshl.v2i24(<2 x i24>, <2 x i24>, <2 x i24>) #0
+
+declare i32 @llvm.fshl.i32(i32, i32, i32) #0
+declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) #0
+declare <3 x i32> @llvm.fshl.v3i32(<3 x i32>, <3 x i32>, <3 x i32>) #0
+declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #0
+declare <5 x i32> @llvm.fshl.v5i32(<5 x i32>, <5 x i32>, <5 x i32>) #0
+declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>) #0
+
+declare i48 @llvm.fshl.i48(i48, i48, i48) #0
+
+declare i64 @llvm.fshl.i64(i64, i64, i64) #0
+declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) #0
+
+declare i128 @llvm.fshl.i128(i128, i128, i128) #0
+declare <2 x i128> @llvm.fshl.v2i128(<2 x i128>, <2 x i128>, <2 x i128>) #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
new file mode 100644
index 000000000000..c1c8cc1363ef
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -0,0 +1,7572 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
+
+define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
+; GFX6-LABEL: s_fshr_i7:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT:    s_sub_i32 s3, 0, 7
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX6-NEXT:    s_movk_i32 s3, 0x7f
+; GFX6-NEXT:    s_and_b32 s2, s2, s3
+; GFX6-NEXT:    s_and_b32 s1, s1, s3
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 7
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 7, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 7, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 6, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX6-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX6-NEXT:    v_lshr_b32_e32 v0, s1, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i7:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT:    s_sub_i32 s3, 0, 7
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX8-NEXT:    s_movk_i32 s3, 0x7f
+; GFX8-NEXT:    s_and_b32 s2, s2, s3
+; GFX8-NEXT:    s_and_b32 s1, s1, s3
+; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 7
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 7, v0
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 7, v0
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_sub_u16_e32 v1, 6, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX8-NEXT:    v_lshlrev_b16_e64 v1, v1, s0
+; GFX8-NEXT:    v_lshrrev_b16_e64 v0, v0, s1
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_sub_i32 s3, 0, 7
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX9-NEXT:    s_movk_i32 s3, 0x7f
+; GFX9-NEXT:    s_and_b32 s2, s2, s3
+; GFX9-NEXT:    s_and_b32 s1, s1, s3
+; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 7
+; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v1, 7, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v1, 7, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_sub_u16_e32 v1, 6, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX9-NEXT:    v_lshlrev_b16_e64 v1, v1, s0
+; GFX9-NEXT:    v_lshrrev_b16_e64 v0, v0, s1
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i7:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 7
+; GFX10-NEXT:    s_sub_i32 s3, 0, 7
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX10-NEXT:    s_movk_i32 s3, 0x7f
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    s_and_b32 s1, s1, s3
+; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX10-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 7
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 7, v0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 7, v0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u16_e64 v1, 6, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX10-NEXT:    v_lshrrev_b16_e64 v0, v0, s1
+; GFX10-NEXT:    v_lshlrev_b16_e64 v1, v1, s0
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt)
+  ret i7 %result
+}
+
+define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
+; GFX6-LABEL: v_fshr_i7:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX6-NEXT:    s_sub_i32 s4, 0, 7
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7f, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, 0x7f
+; GFX6-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX6-NEXT:    v_mul_lo_u32 v3, v3, 7
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 7, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 7, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 6, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX6-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i7:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX8-NEXT:    s_sub_i32 s4, 0, 7
+; GFX8-NEXT:    v_and_b32_e32 v2, 0x7f, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX8-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
+; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7f
+; GFX8-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_mul_lo_u32 v3, v3, 7
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 7, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 7, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_sub_u16_e32 v3, 6, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX8-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX9-NEXT:    s_sub_i32 s4, 0, 7
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x7f, v2
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
+; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7f
+; GFX9-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 7
+; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v3, 7, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v3, 7, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 7, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_sub_u16_e32 v3, 6, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX9-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
+; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i7:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
+; GFX10-NEXT:    s_sub_i32 s4, 0, 7
+; GFX10-NEXT:    v_and_b32_e32 v2, 0x7f, v2
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0x7f, v1
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX10-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GFX10-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v4
+; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX10-NEXT:    v_mul_lo_u32 v3, v3, 7
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 7, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 7, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 7, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7f
+; GFX10-NEXT:    v_sub_nc_u16_e64 v4, 6, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, v2, v3
+; GFX10-NEXT:    v_and_b32_e32 v7, v4, v3
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, v2, v1
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, v7, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt)
+  ret i7 %result
+}
+
+define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
+; GFX6-LABEL: s_fshr_i8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s3, s2, 7
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX6-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX6-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_and_b32 s3, s2, 7
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX9-NEXT:    s_and_b32 s3, s2, 7
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10-NEXT:    s_and_b32 s3, s2, 7
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 %amt)
+  ret i8 %result
+}
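+; For a power-of-two width like i8 the amount is reduced with a mask rather
+; than a urem; roughly (illustrative names):
+;   %sh  = and i8 %amt, 7
+;   %inv = and i8 (xor i8 %amt, -1), 7   ; folds to s_andn2_b32 on the SALU
+;   %result = or i8 (shl (shl i8 %lhs, 1), %inv), (lshr i8 %rhs, %sh)
+; The s_bfe_u32 ..., 0x100000 instructions are 16-bit extracts (width 16,
+; offset 0), i.e. the zero-extension left over from widening the sub-dword
+; right shift to 16 bits.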
+
+define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) {
+; GFX6-LABEL: v_fshr_i8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v3, 7, v2
+; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v3, 7, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v3, 7, v2
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 7, v3
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, v2, v1
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, v3, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 %amt)
+  ret i8 %result
+}
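+; On GFX8/GFX9 the right shift of the byte operand folds into an SDWA form
+; (src1_sel:BYTE_0), which reads only the low byte of v1 and so subsumes the
+; separate 0xff mask; GFX6 and GFX10 keep the explicit v_and_b32.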
+
+define amdgpu_ps i8 @s_fshr_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
+; GFX6-LABEL: s_fshr_i8_4:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 4
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i8_4:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX8-NEXT:    s_lshr_b32 s1, s1, 4
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i8_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 4
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i8_4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 4
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 4)
+  ret i8 %result
+}
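+; With a constant amount the whole expansion folds away: fshr(a, b, 4) on i8
+; becomes a single shl/lshr pair, roughly
+;   %result = or i8 (shl i8 %lhs, 4), (lshr i8 %rhs, 4)
+; (the 0xff mask in the checks is just the zero-extension of the byte held
+; in a 32-bit register).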
+
+define i8 @v_fshr_i8_4(i8 %lhs, i8 %rhs) {
+; GFX6-LABEL: v_fshr_i8_4:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 4, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i8_4:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, 4
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i8_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 4
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i8_4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, 4, v0
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, 4, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 4)
+  ret i8 %result
+}
+
+define amdgpu_ps i8 @s_fshr_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
+; GFX6-LABEL: s_fshr_i8_5:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 5
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i8_5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX8-NEXT:    s_lshr_b32 s1, s1, 5
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i8_5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 5
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i8_5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 5
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 5)
+  ret i8 %result
+}
+
+define i8 @v_fshr_i8_5(i8 %lhs, i8 %rhs) {
+; GFX6-LABEL: v_fshr_i8_5:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 5, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i8_5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, 5
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 3, v0
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i8_5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, 5
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 3, v0
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i8_5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, 3, v0
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, 5, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 5)
+  ret i8 %result
+}
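+; The same fold applies to a non-power-of-two constant amount: fshr by 5 on
+; i8 becomes shl by 3 (= 8 - 5) and lshr by 5, with no urem sequence at all.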
+
+define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 inreg %amt.arg) {
+; GFX6-LABEL: s_fshr_v2i8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_movk_i32 s7, 0xff
+; GFX6-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX6-NEXT:    s_lshr_b32 s4, s1, 8
+; GFX6-NEXT:    s_lshr_b32 s5, s2, 8
+; GFX6-NEXT:    s_and_b32 s6, s2, 7
+; GFX6-NEXT:    s_and_b32 s1, s1, s7
+; GFX6-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX6-NEXT:    s_lshr_b32 s1, s1, s6
+; GFX6-NEXT:    s_andn2_b32 s2, 7, s5
+; GFX6-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s2, s3, s2
+; GFX6-NEXT:    s_and_b32 s1, s5, 7
+; GFX6-NEXT:    s_and_b32 s3, s4, s7
+; GFX6-NEXT:    s_lshr_b32 s1, s3, s1
+; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_and_b32 s1, s1, s7
+; GFX6-NEXT:    s_and_b32 s0, s0, s7
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_v2i8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX8-NEXT:    s_lshr_b32 s5, s2, 8
+; GFX8-NEXT:    s_and_b32 s6, s2, 7
+; GFX8-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX8-NEXT:    s_movk_i32 s2, 0xff
+; GFX8-NEXT:    s_lshr_b32 s4, s1, 8
+; GFX8-NEXT:    s_and_b32 s1, s1, s2
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_and_b32 s4, s4, s2
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s6
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s5, 7
+; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT:    s_andn2_b32 s5, 7, s5
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX8-NEXT:    s_lshl_b32 s3, s3, s5
+; GFX8-NEXT:    s_lshr_b32 s1, s4, s1
+; GFX8-NEXT:    s_or_b32 s1, s3, s1
+; GFX8-NEXT:    s_and_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s1, s1, s2
+; GFX8-NEXT:    s_bfe_u32 s2, 8, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_v2i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s5, s2, 8
+; GFX9-NEXT:    s_and_b32 s6, s2, 7
+; GFX9-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX9-NEXT:    s_movk_i32 s2, 0xff
+; GFX9-NEXT:    s_lshr_b32 s4, s1, 8
+; GFX9-NEXT:    s_and_b32 s1, s1, s2
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_and_b32 s4, s4, s2
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s6
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s5, 7
+; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX9-NEXT:    s_andn2_b32 s5, 7, s5
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX9-NEXT:    s_lshl_b32 s3, s3, s5
+; GFX9-NEXT:    s_lshr_b32 s1, s4, s1
+; GFX9-NEXT:    s_or_b32 s1, s3, s1
+; GFX9-NEXT:    s_and_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s1, s1, s2
+; GFX9-NEXT:    s_bfe_u32 s2, 8, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_v2i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s4, s1, 8
+; GFX10-NEXT:    s_movk_i32 s7, 0xff
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX10-NEXT:    s_and_b32 s4, s4, s7
+; GFX10-NEXT:    s_lshr_b32 s5, s2, 8
+; GFX10-NEXT:    s_and_b32 s6, s2, 7
+; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX10-NEXT:    s_and_b32 s1, s1, s7
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX10-NEXT:    s_and_b32 s2, s5, 7
+; GFX10-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX10-NEXT:    s_andn2_b32 s5, 7, s5
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s3, s3, s5
+; GFX10-NEXT:    s_lshr_b32 s2, s4, s2
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s6
+; GFX10-NEXT:    s_or_b32 s2, s3, s2
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    s_and_b32 s1, s2, s7
+; GFX10-NEXT:    s_bfe_u32 s2, 8, 0x100000
+; GFX10-NEXT:    s_and_b32 s0, s0, s7
+; GFX10-NEXT:    s_lshl_b32 s1, s1, s2
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %lhs = bitcast i16 %lhs.arg to <2 x i8>
+  %rhs = bitcast i16 %rhs.arg to <2 x i8>
+  %amt = bitcast i16 %amt.arg to <2 x i8>
+  %result = call <2 x i8> @llvm.fshr.v2i8(<2 x i8> %lhs, <2 x i8> %rhs, <2 x i8> %amt)
+  %cast.result = bitcast <2 x i8> %result to i16
+  ret i16 %cast.result
+}
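+; <2 x i8> is not a legal type here, so the legalizer unpacks the i16 into
+; two bytes, performs two independent scalar i8 funnel shifts (the same
+; and/andn2 pattern as s_fshr_i8 above), and re-packs the results with a
+; shift-and-or.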
+
+define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
+; GFX6-LABEL: v_fshr_v2i8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 7, v2
+; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    s_movk_i32 s4, 0xff
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v5
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v6, v1
+; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v2, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 7, v5
+; GFX6-NEXT:    v_and_b32_e32 v3, s4, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v1, v3
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_v2i8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX8-NEXT:    v_and_b32_e32 v6, 7, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v5
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 7, v5
+; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, v2, v3
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_v2i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX9-NEXT:    v_and_b32_e32 v6, 7, v2
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v5
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 7, v5
+; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, v2, v3
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_v2i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX10-NEXT:    s_movk_i32 s4, 0xff
+; GFX10-NEXT:    v_and_b32_e32 v7, 7, v2
+; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v3
+; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 7, v3
+; GFX10-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX10-NEXT:    v_lshlrev_b16_e64 v4, 1, v4
+; GFX10-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT:    v_lshrrev_b16_e64 v3, v3, v5
+; GFX10-NEXT:    v_lshlrev_b16_e64 v4, v6, v4
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, v7, v1
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, v2, v0
+; GFX10-NEXT:    v_or_b32_e32 v2, v4, v3
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_sdwa v1, v2, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %lhs = bitcast i16 %lhs.arg to <2 x i8>
+  %rhs = bitcast i16 %rhs.arg to <2 x i8>
+  %amt = bitcast i16 %amt.arg to <2 x i8>
+  %result = call <2 x i8> @llvm.fshr.v2i8(<2 x i8> %lhs, <2 x i8> %rhs, <2 x i8> %amt)
+  %cast.result = bitcast <2 x i8> %result to i16
+  ret i16 %cast.result
+}
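+; GFX10 merges the 0xff mask and the <<8 of the high result byte into one
+; instruction by writing the v_and_b32 result through dst_sel:BYTE_1, i.e.
+; the AND result lands directly in bits [15:8] before the final OR.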
+
+define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 inreg %amt.arg) {
+; GFX6-LABEL: s_fshr_v4i8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_movk_i32 s13, 0xff
+; GFX6-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX6-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX6-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX6-NEXT:    s_lshr_b32 s6, s1, 8
+; GFX6-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX6-NEXT:    s_lshr_b32 s8, s1, 24
+; GFX6-NEXT:    s_lshr_b32 s9, s2, 8
+; GFX6-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX6-NEXT:    s_lshr_b32 s11, s2, 24
+; GFX6-NEXT:    s_and_b32 s12, s2, 7
+; GFX6-NEXT:    s_and_b32 s1, s1, s13
+; GFX6-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX6-NEXT:    s_lshr_b32 s1, s1, s12
+; GFX6-NEXT:    s_andn2_b32 s2, 7, s9
+; GFX6-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s2, s3, s2
+; GFX6-NEXT:    s_and_b32 s1, s9, 7
+; GFX6-NEXT:    s_and_b32 s3, s6, s13
+; GFX6-NEXT:    s_lshr_b32 s1, s3, s1
+; GFX6-NEXT:    s_andn2_b32 s3, 7, s10
+; GFX6-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_lshl_b32 s3, s4, s3
+; GFX6-NEXT:    s_and_b32 s2, s10, 7
+; GFX6-NEXT:    s_and_b32 s4, s7, s13
+; GFX6-NEXT:    s_lshr_b32 s2, s4, s2
+; GFX6-NEXT:    s_and_b32 s1, s1, s13
+; GFX6-NEXT:    s_or_b32 s2, s3, s2
+; GFX6-NEXT:    s_and_b32 s3, s11, 7
+; GFX6-NEXT:    s_andn2_b32 s4, 7, s11
+; GFX6-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX6-NEXT:    s_and_b32 s0, s0, s13
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_and_b32 s1, s2, s13
+; GFX6-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX6-NEXT:    s_lshr_b32 s3, s8, s3
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_or_b32 s3, s4, s3
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_and_b32 s1, s3, s13
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_v4i8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_movk_i32 s13, 0xff
+; GFX8-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX8-NEXT:    s_lshr_b32 s6, s1, 8
+; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s8, s1, 24
+; GFX8-NEXT:    s_and_b32 s1, s1, s13
+; GFX8-NEXT:    s_lshr_b32 s9, s2, 8
+; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX8-NEXT:    s_lshr_b32 s11, s2, 24
+; GFX8-NEXT:    s_and_b32 s12, s2, 7
+; GFX8-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX8-NEXT:    s_andn2_b32 s2, 7, s9
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s12
+; GFX8-NEXT:    s_lshl_b32 s2, s3, s2
+; GFX8-NEXT:    s_and_b32 s3, s6, s13
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s9, 7
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s1, s3, s1
+; GFX8-NEXT:    s_andn2_b32 s3, 7, s10
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX8-NEXT:    s_lshl_b32 s3, s4, s3
+; GFX8-NEXT:    s_and_b32 s4, s7, s13
+; GFX8-NEXT:    s_or_b32 s1, s2, s1
+; GFX8-NEXT:    s_and_b32 s2, s10, 7
+; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s2, s4, s2
+; GFX8-NEXT:    s_and_b32 s1, s1, s13
+; GFX8-NEXT:    s_or_b32 s2, s3, s2
+; GFX8-NEXT:    s_and_b32 s3, s11, 7
+; GFX8-NEXT:    s_andn2_b32 s4, 7, s11
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX8-NEXT:    s_and_b32 s0, s0, s13
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s2, s13
+; GFX8-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX8-NEXT:    s_lshr_b32 s3, s8, s3
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s3, s4, s3
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s3, s13
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_v4i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_movk_i32 s13, 0xff
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX9-NEXT:    s_lshr_b32 s6, s1, 8
+; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s8, s1, 24
+; GFX9-NEXT:    s_and_b32 s1, s1, s13
+; GFX9-NEXT:    s_lshr_b32 s9, s2, 8
+; GFX9-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX9-NEXT:    s_lshr_b32 s11, s2, 24
+; GFX9-NEXT:    s_and_b32 s12, s2, 7
+; GFX9-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX9-NEXT:    s_andn2_b32 s2, 7, s9
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s12
+; GFX9-NEXT:    s_lshl_b32 s2, s3, s2
+; GFX9-NEXT:    s_and_b32 s3, s6, s13
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s9, 7
+; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s1, s3, s1
+; GFX9-NEXT:    s_andn2_b32 s3, 7, s10
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX9-NEXT:    s_lshl_b32 s3, s4, s3
+; GFX9-NEXT:    s_and_b32 s4, s7, s13
+; GFX9-NEXT:    s_or_b32 s1, s2, s1
+; GFX9-NEXT:    s_and_b32 s2, s10, 7
+; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s2, s4, s2
+; GFX9-NEXT:    s_and_b32 s1, s1, s13
+; GFX9-NEXT:    s_or_b32 s2, s3, s2
+; GFX9-NEXT:    s_and_b32 s3, s11, 7
+; GFX9-NEXT:    s_andn2_b32 s4, 7, s11
+; GFX9-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX9-NEXT:    s_and_b32 s0, s0, s13
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s2, s13
+; GFX9-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX9-NEXT:    s_lshr_b32 s3, s8, s3
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX9-NEXT:    s_or_b32 s3, s4, s3
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s3, s13
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_v4i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s6, s1, 8
+; GFX10-NEXT:    s_movk_i32 s13, 0xff
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX10-NEXT:    s_and_b32 s6, s6, s13
+; GFX10-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX10-NEXT:    s_lshr_b32 s8, s1, 24
+; GFX10-NEXT:    s_and_b32 s1, s1, s13
+; GFX10-NEXT:    s_lshr_b32 s9, s2, 8
+; GFX10-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX10-NEXT:    s_lshr_b32 s11, s2, 24
+; GFX10-NEXT:    s_and_b32 s12, s2, 7
+; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX10-NEXT:    s_and_b32 s2, s9, 7
+; GFX10-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX10-NEXT:    s_andn2_b32 s9, 7, s9
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s12
+; GFX10-NEXT:    s_lshr_b32 s2, s6, s2
+; GFX10-NEXT:    s_and_b32 s6, s7, s13
+; GFX10-NEXT:    s_lshl_b32 s3, s3, s9
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    s_or_b32 s1, s3, s2
+; GFX10-NEXT:    s_and_b32 s2, s10, 7
+; GFX10-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX10-NEXT:    s_andn2_b32 s3, 7, s10
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX10-NEXT:    s_lshr_b32 s2, s6, s2
+; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
+; GFX10-NEXT:    s_andn2_b32 s4, 7, s11
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX10-NEXT:    s_and_b32 s6, s11, 7
+; GFX10-NEXT:    s_or_b32 s2, s3, s2
+; GFX10-NEXT:    s_and_b32 s1, s1, s13
+; GFX10-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX10-NEXT:    s_lshr_b32 s5, s8, s6
+; GFX10-NEXT:    s_and_b32 s0, s0, s13
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX10-NEXT:    s_or_b32 s3, s4, s5
+; GFX10-NEXT:    s_and_b32 s2, s2, s13
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    s_lshl_b32 s1, s2, 16
+; GFX10-NEXT:    s_and_b32 s2, s3, s13
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    s_lshl_b32 s1, s2, 24
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %lhs = bitcast i32 %lhs.arg to <4 x i8>
+  %rhs = bitcast i32 %rhs.arg to <4 x i8>
+  %amt = bitcast i32 %amt.arg to <4 x i8>
+  %result = call <4 x i8> @llvm.fshr.v4i8(<4 x i8> %lhs, <4 x i8> %rhs, <4 x i8> %amt)
+  %cast.result = bitcast <4 x i8> %result to i32
+  ret i32 %cast.result
+}
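+; The <4 x i8> case is the same unrolling four times over: note the four
+; andn2/lshl/lshr/or clusters, one per byte, followed by the byte re-pack
+; into the i32 return value.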
+
+define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
+; GFX6-LABEL: v_fshr_v4i8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX6-NEXT:    v_and_b32_e32 v12, 7, v2
+; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX6-NEXT:    s_movk_i32 s4, 0xff
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v12, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 7, v9
+; GFX6-NEXT:    v_xor_b32_e32 v9, -1, v9
+; GFX6-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v1, v6
+; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v10
+; GFX6-NEXT:    v_and_b32_e32 v9, 7, v9
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
+; GFX6-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v9, v3
+; GFX6-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v6, v4
+; GFX6-NEXT:    v_and_b32_e32 v3, 7, v10
+; GFX6-NEXT:    v_and_b32_e32 v6, v7, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v3, v6
+; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v11
+; GFX6-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX6-NEXT:    v_and_b32_e32 v4, 7, v11
+; GFX6-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
+; GFX6-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, v3, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, v6, v5
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v4, v8
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, v4, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_v4i8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v8, 7, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 1, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, v2, v9
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v8
+; GFX8-NEXT:    v_and_b32_e32 v8, 7, v5
+; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX8-NEXT:    v_and_b32_e32 v5, 7, v5
+; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
+; GFX8-NEXT:    v_lshlrev_b16_e32 v3, v5, v3
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v6
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_and_b32_e32 v4, 7, v6
+; GFX8-NEXT:    v_mov_b32_e32 v6, 1
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_and_b32_e32 v5, 7, v5
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0xff
+; GFX8-NEXT:    v_lshlrev_b16_e32 v5, v5, v8
+; GFX8-NEXT:    v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v4, v8
+; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT:    v_and_b32_e32 v5, 7, v7
+; GFX8-NEXT:    v_xor_b32_e32 v7, -1, v7
+; GFX8-NEXT:    v_and_b32_e32 v7, 7, v7
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v7, v0
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v2, s4, v4
+; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_v4i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v8, 7, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT:    v_lshlrev_b16_e32 v9, 1, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, v2, v9
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX9-NEXT:    v_or_b32_e32 v2, v2, v8
+; GFX9-NEXT:    v_and_b32_e32 v8, 7, v5
+; GFX9-NEXT:    v_xor_b32_e32 v5, -1, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX9-NEXT:    v_and_b32_e32 v5, 7, v5
+; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
+; GFX9-NEXT:    v_lshlrev_b16_e32 v3, v5, v3
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_xor_b32_e32 v5, -1, v6
+; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 7, v6
+; GFX9-NEXT:    v_mov_b32_e32 v6, 1
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_e32 v5, 7, v5
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0xff
+; GFX9-NEXT:    v_lshlrev_b16_e32 v5, v5, v8
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b16_e32 v4, v4, v8
+; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_and_b32_e32 v5, 7, v7
+; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v7
+; GFX9-NEXT:    v_and_b32_e32 v7, 7, v7
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v7, v0
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 8
+; GFX9-NEXT:    s_movk_i32 s4, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v2, s4, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, s4, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
+; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_v4i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX10-NEXT:    v_and_b32_e32 v15, 7, v8
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v14, 7, v11
+; GFX10-NEXT:    v_lshlrev_b16_e64 v3, 1, v3
+; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, v15, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, 0xff
+; GFX10-NEXT:    v_lshlrev_b16_e64 v3, v14, v3
+; GFX10-NEXT:    v_xor_b32_e32 v14, -1, v12
+; GFX10-NEXT:    s_movk_i32 s4, 0xff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX10-NEXT:    v_and_b32_e32 v8, s4, v1
+; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
+; GFX10-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX10-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX10-NEXT:    v_and_b32_e32 v11, 7, v11
+; GFX10-NEXT:    v_lshlrev_b16_e64 v4, 1, v4
+; GFX10-NEXT:    v_and_b32_e32 v15, 7, v14
+; GFX10-NEXT:    v_lshlrev_b16_e64 v5, 1, v5
+; GFX10-NEXT:    v_and_b32_e32 v12, 7, v12
+; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT:    v_lshrrev_b16_e64 v6, v6, v7
+; GFX10-NEXT:    v_lshlrev_b16_e64 v4, v11, v4
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, v10, v1
+; GFX10-NEXT:    v_lshlrev_b16_e64 v5, v15, v5
+; GFX10-NEXT:    v_lshrrev_b16_e64 v7, v12, v9
+; GFX10-NEXT:    v_lshrrev_b16_e64 v2, v2, v8
+; GFX10-NEXT:    v_or_b32_e32 v3, v3, v6
+; GFX10-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v6, 8
+; GFX10-NEXT:    v_or_b32_e32 v4, v5, v7
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_and_b32_e32 v3, s4, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %lhs = bitcast i32 %lhs.arg to <4 x i8>
+  %rhs = bitcast i32 %rhs.arg to <4 x i8>
+  %amt = bitcast i32 %amt.arg to <4 x i8>
+  %result = call <4 x i8> @llvm.fshr.v4i8(<4 x i8> %lhs, <4 x i8> %rhs, <4 x i8> %amt)
+  %cast.result = bitcast <4 x i8> %result to i32
+  ret i32 %cast.result
+}
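+; In the VGPR version the re-pack tightens on newer targets: GFX9 and GFX10
+; fuse the mask-and-or and the final three-way OR into v_and_or_b32 and
+; v_or3_b32, while GFX6/GFX8 need discrete v_and/v_lshl/v_or chains.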
+
+define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) {
+; GFX6-LABEL: s_fshr_i24:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT:    s_sub_i32 s3, 0, 24
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX6-NEXT:    s_mov_b32 s3, 0xffffff
+; GFX6-NEXT:    s_and_b32 s2, s2, s3
+; GFX6-NEXT:    s_and_b32 s1, s1, s3
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 24, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 24, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 23, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX6-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX6-NEXT:    v_lshr_b32_e32 v0, s1, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i24:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT:    s_sub_i32 s3, 0, 24
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX8-NEXT:    s_mov_b32 s3, 0xffffff
+; GFX8-NEXT:    s_and_b32 s2, s2, s3
+; GFX8-NEXT:    s_and_b32 s1, s1, s3
+; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 24, v0
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 24, v0
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 23, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, v0, s1
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i24:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_sub_i32 s3, 0, 24
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX9-NEXT:    s_mov_b32 s3, 0xffffff
+; GFX9-NEXT:    s_and_b32 s2, s2, s3
+; GFX9-NEXT:    s_and_b32 s1, s1, s3
+; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v1, 24, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v1, 24, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v1, 23, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX9-NEXT:    v_lshrrev_b32_e64 v0, v0, s1
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, v1, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i24:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX10-NEXT:    s_sub_i32 s3, 0, 24
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX10-NEXT:    s_mov_b32 s3, 0xffffff
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    s_and_b32 s1, s1, s3
+; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX10-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 24, v0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 24, v0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 23, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, s1
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, v1, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i24 @llvm.fshr.i24(i24 %lhs, i24 %rhs, i24 %amt)
+  ret i24 %result
+}
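+; i24 again takes the non-power-of-two path, and there is no scalar integer
+; division, so even this SGPR variant computes amt % 24 on the VALU with the
+; reciprocal sequence and copies the result back with v_readfirstlane_b32
+; (the inputs are uniform, so any lane's value works).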
+
+define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
+; GFX6-LABEL: v_fshr_i24:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX6-NEXT:    s_sub_i32 s4, 0, 24
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffffff
+; GFX6-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX6-NEXT:    v_mul_lo_u32 v3, v3, 24
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 23, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX6-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i24:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX8-NEXT:    s_sub_i32 s4, 0, 24
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX8-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
+; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffffff
+; GFX8-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_mul_lo_u32 v3, v3, 24
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 23, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX8-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i24:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX9-NEXT:    s_sub_i32 s4, 0, 24
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
+; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffff
+; GFX9-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 24
+; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v3, 23, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX9-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v3, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i24:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
+; GFX10-NEXT:    s_sub_i32 s4, 0, 24
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX10-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GFX10-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffffff
+; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX10-NEXT:    v_mul_lo_u32 v3, v3, 24
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX10-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v3, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i24 @llvm.fshr.i24(i24 %lhs, i24 %rhs, i24 %amt)
+  ret i24 %result
+}
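+; The VGPR i24 variant is the same expansion with 0xffffff masks standing in
+; for the nonexistent 24-bit register class; GFX9 and GFX10 also merge the
+; final shl+or into v_lshl_or_b32.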
+
+define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 inreg %amt.arg) {
+; GFX6-LABEL: s_fshr_v2i24:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_movk_i32 s10, 0xff
+; GFX6-NEXT:    s_lshr_b32 s9, s1, 8
+; GFX6-NEXT:    s_and_b32 s1, s1, s10
+; GFX6-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX6-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX6-NEXT:    s_or_b32 s1, s8, s1
+; GFX6-NEXT:    s_and_b32 s6, s6, s10
+; GFX6-NEXT:    s_lshr_b32 s8, s2, 8
+; GFX6-NEXT:    s_and_b32 s8, s8, s10
+; GFX6-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX6-NEXT:    s_and_b32 s0, s0, s10
+; GFX6-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX6-NEXT:    s_or_b32 s0, s0, s6
+; GFX6-NEXT:    s_and_b32 s6, s7, s10
+; GFX6-NEXT:    s_and_b32 s7, s9, s10
+; GFX6-NEXT:    s_lshr_b32 s9, s2, 16
+; GFX6-NEXT:    s_lshr_b32 s11, s2, 24
+; GFX6-NEXT:    s_and_b32 s2, s2, s10
+; GFX6-NEXT:    s_lshl_b32 s8, s8, 8
+; GFX6-NEXT:    s_or_b32 s2, s2, s8
+; GFX6-NEXT:    s_and_b32 s8, s9, s10
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT:    s_lshr_b32 s12, s3, 8
+; GFX6-NEXT:    s_and_b32 s3, s3, s10
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX6-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX6-NEXT:    s_or_b32 s2, s2, s8
+; GFX6-NEXT:    s_and_b32 s8, s12, s10
+; GFX6-NEXT:    s_or_b32 s3, s11, s3
+; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX6-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX6-NEXT:    s_or_b32 s3, s3, s8
+; GFX6-NEXT:    s_lshr_b32 s8, s4, 8
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    s_and_b32 s8, s8, s10
+; GFX6-NEXT:    s_lshr_b32 s9, s4, 16
+; GFX6-NEXT:    s_lshr_b32 s11, s4, 24
+; GFX6-NEXT:    s_and_b32 s4, s4, s10
+; GFX6-NEXT:    s_lshl_b32 s8, s8, 8
+; GFX6-NEXT:    s_or_b32 s4, s4, s8
+; GFX6-NEXT:    s_and_b32 s8, s9, s10
+; GFX6-NEXT:    s_sub_i32 s9, 0, 24
+; GFX6-NEXT:    v_mul_lo_u32 v1, s9, v0
+; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_or_b32 s4, s4, s8
+; GFX6-NEXT:    s_lshr_b32 s12, s5, 8
+; GFX6-NEXT:    s_and_b32 s5, s5, s10
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v1, 24
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX6-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX6-NEXT:    s_and_b32 s8, s12, s10
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v1
+; GFX6-NEXT:    s_or_b32 s5, s11, s5
+; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
+; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX6-NEXT:    s_or_b32 s5, s5, s8
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
+; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX6-NEXT:    s_mov_b32 s8, 0xffffff
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 23, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX6-NEXT:    s_lshl_b32 s4, s6, 17
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
+; GFX6-NEXT:    s_or_b32 s0, s4, s0
+; GFX6-NEXT:    v_and_b32_e32 v2, s8, v3
+; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
+; GFX6-NEXT:    v_lshr_b32_e32 v0, s2, v0
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v1
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v1
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffffff
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 23, v1
+; GFX6-NEXT:    s_lshl_b32 s0, s7, 17
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX6-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
+; GFX6-NEXT:    v_lshr_b32_e32 v1, s3, v1
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, s10, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, s10, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, s10, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, s10, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, s10, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, s10, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_v2i24:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_movk_i32 s10, 0xff
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 8
+; GFX8-NEXT:    s_bfe_u32 s11, 8, 0x100000
+; GFX8-NEXT:    s_and_b32 s1, s1, s10
+; GFX8-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX8-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s11
+; GFX8-NEXT:    s_or_b32 s1, s8, s1
+; GFX8-NEXT:    s_and_b32 s6, s6, s10
+; GFX8-NEXT:    s_lshr_b32 s8, s2, 8
+; GFX8-NEXT:    s_and_b32 s8, s8, s10
+; GFX8-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, s10
+; GFX8-NEXT:    s_lshl_b32 s6, s6, s11
+; GFX8-NEXT:    s_or_b32 s0, s0, s6
+; GFX8-NEXT:    s_and_b32 s6, s7, s10
+; GFX8-NEXT:    s_and_b32 s7, s9, s10
+; GFX8-NEXT:    s_lshr_b32 s9, s2, 16
+; GFX8-NEXT:    s_lshr_b32 s12, s2, 24
+; GFX8-NEXT:    s_and_b32 s2, s2, s10
+; GFX8-NEXT:    s_lshl_b32 s8, s8, s11
+; GFX8-NEXT:    s_or_b32 s2, s2, s8
+; GFX8-NEXT:    s_and_b32 s8, s9, s10
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX8-NEXT:    s_bfe_u32 s8, s8, 0x100000
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT:    s_lshr_b32 s13, s3, 8
+; GFX8-NEXT:    s_and_b32 s3, s3, s10
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX8-NEXT:    s_lshl_b32 s3, s3, s11
+; GFX8-NEXT:    s_or_b32 s2, s2, s8
+; GFX8-NEXT:    s_and_b32 s8, s13, s10
+; GFX8-NEXT:    s_or_b32 s3, s12, s3
+; GFX8-NEXT:    s_bfe_u32 s8, s8, 0x100000
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX8-NEXT:    s_or_b32 s3, s3, s8
+; GFX8-NEXT:    s_lshr_b32 s8, s4, 8
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX8-NEXT:    s_and_b32 s8, s8, s10
+; GFX8-NEXT:    s_lshr_b32 s9, s4, 16
+; GFX8-NEXT:    s_lshr_b32 s12, s4, 24
+; GFX8-NEXT:    s_and_b32 s4, s4, s10
+; GFX8-NEXT:    s_lshl_b32 s8, s8, s11
+; GFX8-NEXT:    s_or_b32 s4, s4, s8
+; GFX8-NEXT:    s_and_b32 s8, s9, s10
+; GFX8-NEXT:    s_sub_i32 s9, 0, 24
+; GFX8-NEXT:    v_mul_lo_u32 v1, s9, v0
+; GFX8-NEXT:    s_bfe_u32 s8, s8, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX8-NEXT:    s_or_b32 s4, s4, s8
+; GFX8-NEXT:    s_lshr_b32 s13, s5, 8
+; GFX8-NEXT:    s_and_b32 s5, s5, s10
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v1, 24
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX8-NEXT:    s_lshl_b32 s5, s5, s11
+; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX8-NEXT:    s_and_b32 s8, s13, s10
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v0
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v0
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v2, s9, v1
+; GFX8-NEXT:    s_or_b32 s5, s12, s5
+; GFX8-NEXT:    s_bfe_u32 s8, s8, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s5, s5, 0x100000
+; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX8-NEXT:    s_or_b32 s5, s5, s8
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
+; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX8-NEXT:    s_mov_b32 s8, 0xffffff
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 23, v0
+; GFX8-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX8-NEXT:    s_lshl_b32 s4, s6, 17
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    v_and_b32_e32 v0, s8, v0
+; GFX8-NEXT:    s_or_b32 s0, s4, s0
+; GFX8-NEXT:    v_and_b32_e32 v2, s8, v3
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, v0, s2
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v1
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v1
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffffff
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 23, v1
+; GFX8-NEXT:    s_lshl_b32 s0, s7, 17
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX8-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
+; GFX8-NEXT:    v_lshrrev_b32_e64 v1, v1, s3
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s10
+; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, s10, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_v2i24:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_movk_i32 s11, 0xff
+; GFX9-NEXT:    s_lshr_b32 s10, s1, 8
+; GFX9-NEXT:    s_bfe_u32 s12, 8, 0x100000
+; GFX9-NEXT:    s_and_b32 s1, s1, s11
+; GFX9-NEXT:    s_lshr_b32 s7, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s9, s0, 24
+; GFX9-NEXT:    s_lshl_b32 s1, s1, s12
+; GFX9-NEXT:    s_or_b32 s1, s9, s1
+; GFX9-NEXT:    s_and_b32 s7, s7, s11
+; GFX9-NEXT:    s_lshr_b32 s9, s2, 8
+; GFX9-NEXT:    s_and_b32 s9, s9, s11
+; GFX9-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX9-NEXT:    s_and_b32 s0, s0, s11
+; GFX9-NEXT:    s_lshl_b32 s7, s7, s12
+; GFX9-NEXT:    s_or_b32 s0, s0, s7
+; GFX9-NEXT:    s_and_b32 s7, s8, s11
+; GFX9-NEXT:    s_and_b32 s8, s10, s11
+; GFX9-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX9-NEXT:    s_lshr_b32 s13, s2, 24
+; GFX9-NEXT:    s_and_b32 s2, s2, s11
+; GFX9-NEXT:    s_lshl_b32 s9, s9, s12
+; GFX9-NEXT:    s_or_b32 s2, s2, s9
+; GFX9-NEXT:    s_and_b32 s9, s10, s11
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX9-NEXT:    s_bfe_u32 s9, s9, 0x100000
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_lshr_b32 s14, s3, 8
+; GFX9-NEXT:    s_and_b32 s3, s3, s11
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 16
+; GFX9-NEXT:    s_lshl_b32 s3, s3, s12
+; GFX9-NEXT:    s_or_b32 s2, s2, s9
+; GFX9-NEXT:    s_and_b32 s9, s14, s11
+; GFX9-NEXT:    s_or_b32 s3, s13, s3
+; GFX9-NEXT:    s_bfe_u32 s9, s9, 0x100000
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 16
+; GFX9-NEXT:    s_or_b32 s3, s3, s9
+; GFX9-NEXT:    s_lshr_b32 s9, s4, 8
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_and_b32 s9, s9, s11
+; GFX9-NEXT:    s_lshr_b32 s10, s4, 16
+; GFX9-NEXT:    s_lshr_b32 s13, s4, 24
+; GFX9-NEXT:    s_and_b32 s4, s4, s11
+; GFX9-NEXT:    s_lshl_b32 s9, s9, s12
+; GFX9-NEXT:    s_or_b32 s4, s4, s9
+; GFX9-NEXT:    s_and_b32 s9, s10, s11
+; GFX9-NEXT:    s_sub_i32 s10, 0, 24
+; GFX9-NEXT:    v_mul_lo_u32 v1, s10, v0
+; GFX9-NEXT:    s_bfe_u32 s9, s9, 0x100000
+; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 16
+; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX9-NEXT:    s_or_b32 s4, s4, s9
+; GFX9-NEXT:    s_lshr_b32 s14, s5, 8
+; GFX9-NEXT:    s_and_b32 s5, s5, s11
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, 24
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX9-NEXT:    s_lshl_b32 s5, s5, s12
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX9-NEXT:    s_and_b32 s9, s14, s11
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v1
+; GFX9-NEXT:    s_or_b32 s5, s13, s5
+; GFX9-NEXT:    s_bfe_u32 s9, s9, 0x100000
+; GFX9-NEXT:    s_bfe_u32 s5, s5, 0x100000
+; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 16
+; GFX9-NEXT:    s_or_b32 s5, s5, s9
+; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v0
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX9-NEXT:    s_mov_b32 s9, 0xffffff
+; GFX9-NEXT:    v_sub_u32_e32 v3, 23, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, s9, v0
+; GFX9-NEXT:    s_lshl_b32 s4, s7, 17
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
+; GFX9-NEXT:    s_or_b32 s0, s4, s0
+; GFX9-NEXT:    v_and_b32_e32 v3, s9, v3
+; GFX9-NEXT:    v_lshrrev_b32_e64 v0, v0, s2
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, v3, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    s_bfe_u32 s8, s8, 0x100000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffffff
+; GFX9-NEXT:    v_sub_u32_e32 v3, 23, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX9-NEXT:    s_lshl_b32 s0, s8, 17
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX9-NEXT:    v_and_b32_e32 v3, v3, v2
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    v_lshrrev_b32_e64 v1, v1, s3
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, v3, v1
+; GFX9-NEXT:    s_mov_b32 s6, 8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_e32 v4, s11, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX9-NEXT:    v_and_or_b32 v2, v0, s11, v2
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX9-NEXT:    v_or3_b32 v0, v2, v0, v4
+; GFX9-NEXT:    v_and_or_b32 v1, v3, s11, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_v2i24:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, 24
+; GFX10-NEXT:    s_sub_i32 s12, 0, 24
+; GFX10-NEXT:    s_movk_i32 s9, 0xff
+; GFX10-NEXT:    s_lshr_b32 s14, s4, 8
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX10-NEXT:    s_lshr_b32 s15, s4, 16
+; GFX10-NEXT:    s_bfe_u32 s10, 8, 0x100000
+; GFX10-NEXT:    s_and_b32 s14, s14, s9
+; GFX10-NEXT:    s_and_b32 s16, s4, s9
+; GFX10-NEXT:    s_lshl_b32 s14, s14, s10
+; GFX10-NEXT:    s_and_b32 s15, s15, s9
+; GFX10-NEXT:    s_or_b32 s14, s16, s14
+; GFX10-NEXT:    s_lshr_b32 s4, s4, 24
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX10-NEXT:    s_bfe_u32 s14, s14, 0x100000
+; GFX10-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX10-NEXT:    s_lshr_b32 s11, s1, 8
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT:    s_and_b32 s1, s1, s9
+; GFX10-NEXT:    s_and_b32 s6, s6, s9
+; GFX10-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX10-NEXT:    v_mul_lo_u32 v2, s12, v0
+; GFX10-NEXT:    v_mul_lo_u32 v3, s12, v1
+; GFX10-NEXT:    s_bfe_u32 s12, s15, 0x100000
+; GFX10-NEXT:    s_lshr_b32 s15, s5, 8
+; GFX10-NEXT:    s_lshl_b32 s12, s12, 16
+; GFX10-NEXT:    s_and_b32 s5, s5, s9
+; GFX10-NEXT:    s_or_b32 s12, s14, s12
+; GFX10-NEXT:    s_lshl_b32 s5, s5, s10
+; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX10-NEXT:    s_and_b32 s14, s15, s9
+; GFX10-NEXT:    s_or_b32 s4, s4, s5
+; GFX10-NEXT:    s_bfe_u32 s5, s14, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX10-NEXT:    s_lshl_b32 s1, s1, s10
+; GFX10-NEXT:    s_or_b32 s4, s4, s5
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
+; GFX10-NEXT:    v_mul_hi_u32 v2, v1, v3
+; GFX10-NEXT:    s_or_b32 s1, s8, s1
+; GFX10-NEXT:    s_lshr_b32 s8, s2, 8
+; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX10-NEXT:    v_mul_hi_u32 v0, s12, v0
+; GFX10-NEXT:    s_and_b32 s0, s0, s9
+; GFX10-NEXT:    s_lshl_b32 s6, s6, s10
+; GFX10-NEXT:    s_and_b32 s8, s8, s9
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
+; GFX10-NEXT:    s_or_b32 s0, s0, s6
+; GFX10-NEXT:    s_and_b32 s6, s7, s9
+; GFX10-NEXT:    s_and_b32 s7, s11, s9
+; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX10-NEXT:    v_mul_hi_u32 v1, s4, v1
+; GFX10-NEXT:    s_lshr_b32 s11, s2, 16
+; GFX10-NEXT:    s_and_b32 s13, s2, s9
+; GFX10-NEXT:    s_lshl_b32 s5, s8, s10
+; GFX10-NEXT:    s_and_b32 s8, s11, s9
+; GFX10-NEXT:    s_lshr_b32 s11, s3, 8
+; GFX10-NEXT:    s_and_b32 s3, s3, s9
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s12, v0
+; GFX10-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX10-NEXT:    s_or_b32 s5, s13, s5
+; GFX10-NEXT:    s_bfe_u32 s8, s8, 0x100000
+; GFX10-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT:    s_lshl_b32 s3, s3, s10
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffffff
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s4, v1
+; GFX10-NEXT:    s_mov_b32 s4, 0xffffff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    s_bfe_u32 s5, s5, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v1
+; GFX10-NEXT:    s_or_b32 s2, s2, s3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT:    s_and_b32 s3, s11, s9
+; GFX10-NEXT:    s_or_b32 s5, s5, s8
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
+; GFX10-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 23, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 17
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v1
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
+; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, s5
+; GFX10-NEXT:    s_or_b32 s0, s6, s0
+; GFX10-NEXT:    s_or_b32 s2, s2, s3
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, v2, v0
+; GFX10-NEXT:    s_lshl_b32 s0, s7, 17
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    v_and_b32_e32 v2, v3, v4
+; GFX10-NEXT:    v_lshrrev_b32_e64 v1, v1, s2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, v2, v1
+; GFX10-NEXT:    s_mov_b32 s0, 8
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_and_b32_e32 v3, s9, v1
+; GFX10-NEXT:    v_and_b32_sdwa v4, v1, s9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-NEXT:    v_and_or_b32 v2, v0, s9, v2
+; GFX10-NEXT:    v_and_b32_sdwa v0, v0, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX10-NEXT:    v_or3_b32 v0, v2, v0, v3
+; GFX10-NEXT:    v_and_or_b32 v1, v1, s9, v4
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-NEXT:    ; return to shader part epilog
+  %lhs = bitcast i48 %lhs.arg to <2 x i24>
+  %rhs = bitcast i48 %rhs.arg to <2 x i24>
+  %amt = bitcast i48 %amt.arg to <2 x i24>
+  %result = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
+  %cast.result = bitcast <2 x i24> %result to i48
+  ret i48 %cast.result
+}
+
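+; The i24 element width is not a power of two, so the shift amount cannot be
+; masked; it is instead reduced modulo 24 at runtime, using the
+; reciprocal-based unsigned division sequence (v_rcp_iflag_f32 /
+; v_mul_hi_u32 / v_mul_lo_u32 by 24) seen in these checks.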
+define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
+; GFX6-LABEL: v_fshr_v2i24:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GFX6-NEXT:    s_sub_i32 s4, 0, 24
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v8, 24
+; GFX6-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_mul_lo_u32 v7, s4, v6
+; GFX6-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GFX6-NEXT:    v_mul_hi_u32 v6, v4, v6
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v8
+; GFX6-NEXT:    v_mov_b32_e32 v8, 0xffffff
+; GFX6-NEXT:    v_and_b32_e32 v5, v5, v8
+; GFX6-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX6-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v8
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
+; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 24, v4
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 24, v4
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, s4, v7
+; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 23, v4
+; GFX6-NEXT:    v_and_b32_e32 v9, v9, v8
+; GFX6-NEXT:    v_and_b32_e32 v4, v4, v8
+; GFX6-NEXT:    v_mul_hi_u32 v6, v7, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v9, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX6-NEXT:    v_mul_hi_u32 v6, v5, v6
+; GFX6-NEXT:    v_and_b32_e32 v3, v3, v8
+; GFX6-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v5, v6
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 24, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 24, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 23, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v8
+; GFX6-NEXT:    v_and_b32_e32 v4, v4, v8
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_v2i24:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GFX8-NEXT:    s_sub_i32 s4, 0, 24
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v8, 24
+; GFX8-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_mul_lo_u32 v7, s4, v6
+; GFX8-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
+; GFX8-NEXT:    v_mul_hi_u32 v6, v4, v6
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v7, v8
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0xffffff
+; GFX8-NEXT:    v_and_b32_e32 v5, v5, v8
+; GFX8-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX8-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v8
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v6
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 24, v4
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 24, v4
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v6, s4, v7
+; GFX8-NEXT:    v_sub_u32_e32 v9, vcc, 23, v4
+; GFX8-NEXT:    v_and_b32_e32 v9, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v4, v4, v8
+; GFX8-NEXT:    v_mul_hi_u32 v6, v7, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v9, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
+; GFX8-NEXT:    v_mul_hi_u32 v6, v5, v6
+; GFX8-NEXT:    v_and_b32_e32 v3, v3, v8
+; GFX8-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v5, v6
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 24, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 24, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 23, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v8
+; GFX8-NEXT:    v_and_b32_e32 v4, v4, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_v2i24:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GFX9-NEXT:    s_sub_i32 s4, 0, 24
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v8, 24
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v8
+; GFX9-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX9-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0xffffff
+; GFX9-NEXT:    v_mul_lo_u32 v7, s4, v6
+; GFX9-NEXT:    v_and_b32_e32 v5, v5, v9
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX9-NEXT:    v_and_b32_e32 v3, v3, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v7
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v8
+; GFX9-NEXT:    v_mul_hi_u32 v6, v4, v6
+; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v7
+; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX9-NEXT:    v_mul_hi_u32 v8, v7, v8
+; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v6
+; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
+; GFX9-NEXT:    v_add_u32_e32 v7, v7, v8
+; GFX9-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
+; GFX9-NEXT:    v_mul_lo_u32 v7, v7, 24
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v6, 23, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, v4, v9
+; GFX9-NEXT:    v_and_b32_e32 v6, v6, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v6, v2
+; GFX9-NEXT:    v_sub_u32_e32 v2, v5, v7
+; GFX9-NEXT:    v_subrev_u32_e32 v4, 24, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v4, 24, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v4, 23, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v9
+; GFX9-NEXT:    v_and_b32_e32 v4, v4, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
+; GFX9-NEXT:    v_lshl_or_b32 v1, v1, v4, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_v2i24:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v7, 24
+; GFX10-NEXT:    s_sub_i32 s4, 0, 24
+; GFX10-NEXT:    v_mov_b32_e32 v12, 0xffffff
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v7, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v5, v5, v12
+; GFX10-NEXT:    v_and_b32_e32 v2, v2, v12
+; GFX10-NEXT:    v_and_b32_e32 v3, v3, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX10-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GFX10-NEXT:    v_mul_lo_u32 v8, s4, v6
+; GFX10-NEXT:    v_mul_lo_u32 v9, s4, v7
+; GFX10-NEXT:    v_mul_hi_u32 v8, v6, v8
+; GFX10-NEXT:    v_mul_hi_u32 v9, v7, v9
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v8
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v7, v9
+; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v6
+; GFX10-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GFX10-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX10-NEXT:    v_mul_lo_u32 v7, v7, 24
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v5, v5, v7
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v4
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v4
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v4, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
+; GFX10-NEXT:    v_and_b32_e32 v4, v11, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 23, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v7, 23, v5
+; GFX10-NEXT:    v_and_b32_e32 v5, v5, v12
+; GFX10-NEXT:    v_and_b32_e32 v11, v6, v12
+; GFX10-NEXT:    v_and_b32_e32 v4, v7, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v3
+; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v11, v2
+; GFX10-NEXT:    v_lshl_or_b32 v1, v1, v4, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
+  ret <2 x i24> %result
+}
+
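+; A 32-bit funnel shift right matches the hardware v_alignbit_b32
+; instruction directly; the scalar (SGPR) variants still execute on the VALU
+; and recover the SGPR result with v_readfirstlane_b32.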
+define amdgpu_ps i32 @s_fshr_i32(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) {
+; GFX6-LABEL: s_fshr_i32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_mov_b32_e32 v0, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s2
+; GFX6-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_alignbit_b32 v0, s0, s1, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt)
+  ret i32 %result
+}
+
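+; Constant shift amounts fold directly into the final v_alignbit_b32 operand.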
+define amdgpu_ps i32 @s_fshr_i32_5(i32 inreg %lhs, i32 inreg %rhs) {
+; GFX6-LABEL: s_fshr_i32_5:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_mov_b32_e32 v0, s1
+; GFX6-NEXT:    v_alignbit_b32 v0, s0, v0, 5
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i32_5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    v_alignbit_b32 v0, s0, v0, 5
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i32_5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, 5
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i32_5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_alignbit_b32 v0, s0, s1, 5
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 5)
+  ret i32 %result
+}
+
+define amdgpu_ps i32 @s_fshr_i32_8(i32 inreg %lhs, i32 inreg %rhs) {
+; GFX6-LABEL: s_fshr_i32_8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_mov_b32_e32 v0, s1
+; GFX6-NEXT:    v_alignbit_b32 v0, s0, v0, 8
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i32_8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    v_alignbit_b32 v0, s0, v0, 8
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i32_8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, 8
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i32_8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_alignbit_b32 v0, s0, s1, 8
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 8)
+  ret i32 %result
+}
+
+define i32 @v_fshr_i32(i32 %lhs, i32 %rhs, i32 %amt) {
+; GFX6-LABEL: v_fshr_i32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt)
+  ret i32 %result
+}
+
+define i32 @v_fshr_i32_5(i32 %lhs, i32 %rhs) {
+; GFX6-LABEL: v_fshr_i32_5:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_alignbit_b32 v0, v0, v1, 5
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i32_5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, 5
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i32_5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v0, v0, v1, 5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i32_5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, 5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 5)
+  ret i32 %result
+}
+
+define i32 @v_fshr_i32_8(i32 %lhs, i32 %rhs) {
+; GFX6-LABEL: v_fshr_i32_8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_alignbit_b32 v0, v0, v1, 8
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i32_8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, 8
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i32_8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v0, v0, v1, 8
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i32_8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, 8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 8)
+  ret i32 %result
+}
+
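+; Mixed SGPR/VGPR operand combinations. Before GFX10 a VALU instruction may
+; read only one scalar source, so extra v_mov_b32 copies appear; GFX10 reads
+; both scalar sources directly.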
+define amdgpu_ps float @v_fshr_i32_ssv(i32 inreg %lhs, i32 inreg %rhs, i32 %amt) {
+; GFX6-LABEL: v_fshr_i32_ssv:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_alignbit_b32 v0, s0, v1, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i32_ssv:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_alignbit_b32 v0, s0, v1, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i32_ssv:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_alignbit_b32 v0, s0, v1, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i32_ssv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_alignbit_b32 v0, s0, s1, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt)
+  %cast.result = bitcast i32 %result to float
+  ret float %cast.result
+}
+
+define amdgpu_ps float @v_fshr_i32_svs(i32 inreg %lhs, i32 %rhs, i32 inreg %amt) {
+; GFX6-LABEL: v_fshr_i32_svs:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i32_svs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i32_svs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i32_svs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_alignbit_b32 v0, s0, v0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt)
+  %cast.result = bitcast i32 %result to float
+  ret float %cast.result
+}
+
+define amdgpu_ps float @v_fshr_i32_vss(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) {
+; GFX6-LABEL: v_fshr_i32_vss:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_mov_b32_e32 v0, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s2
+; GFX6-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i32_vss:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i32_vss:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i32_vss:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_alignbit_b32 v0, s0, s1, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt)
+  %cast.result = bitcast i32 %result to float
+  ret float %cast.result
+}
+
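+; Vector i32 cases scalarize to one v_alignbit_b32 per element.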
+define <2 x i32> @v_fshr_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) {
+; GFX6-LABEL: v_fshr_v2i32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_alignbit_b32 v0, v0, v2, v4
+; GFX6-NEXT:    v_alignbit_b32 v1, v1, v3, v5
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_v2i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, v4
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v3, v5
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_v2i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v0, v0, v2, v4
+; GFX9-NEXT:    v_alignbit_b32 v1, v1, v3, v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_v2i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
+; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt)
+  ret <2 x i32> %result
+}
+
+define <3 x i32> @v_fshr_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) {
+; GFX6-LABEL: v_fshr_v3i32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_alignbit_b32 v0, v0, v3, v6
+; GFX6-NEXT:    v_alignbit_b32 v1, v1, v4, v7
+; GFX6-NEXT:    v_alignbit_b32 v2, v2, v5, v8
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_v3i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, v6
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, v7
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v5, v8
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_v3i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v0, v0, v3, v6
+; GFX9-NEXT:    v_alignbit_b32 v1, v1, v4, v7
+; GFX9-NEXT:    v_alignbit_b32 v2, v2, v5, v8
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_v3i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_alignbit_b32 v0, v0, v3, v6
+; GFX10-NEXT:    v_alignbit_b32 v1, v1, v4, v7
+; GFX10-NEXT:    v_alignbit_b32 v2, v2, v5, v8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt)
+  ret <3 x i32> %result
+}
+
+define <4 x i32> @v_fshr_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) {
+; GFX6-LABEL: v_fshr_v4i32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_alignbit_b32 v0, v0, v4, v8
+; GFX6-NEXT:    v_alignbit_b32 v1, v1, v5, v9
+; GFX6-NEXT:    v_alignbit_b32 v2, v2, v6, v10
+; GFX6-NEXT:    v_alignbit_b32 v3, v3, v7, v11
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_v4i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, v8
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v5, v9
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v6, v10
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v7, v11
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_v4i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v0, v0, v4, v8
+; GFX9-NEXT:    v_alignbit_b32 v1, v1, v5, v9
+; GFX9-NEXT:    v_alignbit_b32 v2, v2, v6, v10
+; GFX9-NEXT:    v_alignbit_b32 v3, v3, v7, v11
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_v4i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_alignbit_b32 v0, v0, v4, v8
+; GFX10-NEXT:    v_alignbit_b32 v1, v1, v5, v9
+; GFX10-NEXT:    v_alignbit_b32 v2, v2, v6, v10
+; GFX10-NEXT:    v_alignbit_b32 v3, v3, v7, v11
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt)
+  ret <4 x i32> %result
+}
+
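+; There is no 16-bit alignbit, so i16 funnel shifts use the inverse-amount
+; expansion ((lhs << 1) << (~amt & 15)) | (rhs >> (amt & 15)), which avoids
+; an out-of-range shift by the full bit width when amt is 0.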
+define amdgpu_ps i16 @s_fshr_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt) {
+; GFX6-LABEL: s_fshr_i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s3, s2, 15
+; GFX6-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX6-NEXT:    s_bfe_u32 s2, s3, 0x100000
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s3, s2, 15
+; GFX8-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX8-NEXT:    s_bfe_u32 s4, 1, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s2, s3, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s3, s2, 15
+; GFX9-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX9-NEXT:    s_bfe_u32 s4, 1, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_bfe_u32 s2, s3, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b32 s3, s2, 15
+; GFX10-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX10-NEXT:    s_bfe_u32 s4, 1, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt)
+  ret i16 %result
+}
+
+define amdgpu_ps i16 @s_fshr_i16_4(i16 inreg %lhs, i16 inreg %rhs) {
+; GFX6-LABEL: s_fshr_i16_4:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 12
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 4
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i16_4:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_bfe_u32 s2, 12, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s2, 4, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i16_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_bfe_u32 s2, 12, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_bfe_u32 s2, 4, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i16_4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_bfe_u32 s2, 12, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s3, 4, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 4)
+  ret i16 %result
+}
+
+define amdgpu_ps i16 @s_fshr_i16_5(i16 inreg %lhs, i16 inreg %rhs) {
+; GFX6-LABEL: s_fshr_i16_5:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 11
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 5
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i16_5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_bfe_u32 s2, 11, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s2, 5, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i16_5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_bfe_u32 s2, 11, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_bfe_u32 s2, 5, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i16_5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_bfe_u32 s2, 11, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s3, 5, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 5)
+  ret i16 %result
+}
+
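+; GFX8+ have native 16-bit VALU shifts (v_lshlrev_b16 / v_lshrrev_b16);
+; GFX6 widens to 32 bits instead, masking with 0xffff / v_bfe_u32.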
+define i16 @v_fshr_i16(i16 %lhs, i16 %rhs, i16 %amt) {
+; GFX6-LABEL: v_fshr_i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v3, 15, v2
+; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_bfe_u32 v2, v3, 0, 16
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v3, 15, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v3, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v3, 15, v2
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
+; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v3, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, v2, v1
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, v3, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt)
+  ret i16 %result
+}
+
+define i16 @v_fshr_i16_4(i16 %lhs, i16 %rhs) {
+; GFX6-LABEL: v_fshr_i16_4:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 12, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 4, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i16_4:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 12, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 4, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i16_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 12, v0
+; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 4, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i16_4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, 12, v0
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, 4, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 4)
+  ret i16 %result
+}
+
+define i16 @v_fshr_i16_5(i16 %lhs, i16 %rhs) {
+; GFX6-LABEL: v_fshr_i16_5:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 11, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 5, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i16_5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 11, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 5, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i16_5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 11, v0
+; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 5, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i16_5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, 11, v0
+; GFX10-NEXT:    v_lshrrev_b16_e64 v1, 5, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 5)
+  ret i16 %result
+}
+
+define amdgpu_ps half @v_fshr_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt) {
+; GFX6-LABEL: v_fshr_i16_ssv:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_and_b32_e32 v1, 15, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT:    v_lshl_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
+; GFX6-NEXT:    v_lshr_b32_e32 v1, s0, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i16_ssv:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_and_b32_e32 v1, 15, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    s_bfe_u32 s2, 1, 0x100000
+; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX8-NEXT:    v_lshlrev_b16_e64 v0, v0, s0
+; GFX8-NEXT:    v_lshrrev_b16_e64 v1, v1, s1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i16_ssv:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_and_b32_e32 v1, 15, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT:    s_bfe_u32 s2, 1, 0x100000
+; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX9-NEXT:    v_lshlrev_b16_e64 v0, v0, s0
+; GFX9-NEXT:    v_lshrrev_b16_e64 v1, v1, s1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i16_ssv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX10-NEXT:    s_bfe_u32 s2, 1, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX10-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX10-NEXT:    v_lshrrev_b16_e64 v0, v0, s1
+; GFX10-NEXT:    v_lshlrev_b16_e64 v1, v1, s0
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt)
+  %cast.result = bitcast i16 %result to half
+  ret half %cast.result
+}
+
+define amdgpu_ps half @v_fshr_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt) {
+; GFX6-LABEL: v_fshr_i16_svs:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s2, s1, 15
+; GFX6-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX6-NEXT:    s_bfe_u32 s1, s2, 0x100000
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s1, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i16_svs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s2, s1, 15
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    s_bfe_u32 s3, 1, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v0, s2, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i16_svs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s2, s1, 15
+; GFX9-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX9-NEXT:    s_bfe_u32 s3, 1, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX9-NEXT:    v_lshrrev_b16_e32 v0, s2, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i16_svs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b32 s2, s1, 15
+; GFX10-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX10-NEXT:    s_bfe_u32 s3, 1, 0x100000
+; GFX10-NEXT:    v_lshrrev_b16_e64 v0, s2, v0
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt)
+  %cast.result = bitcast i16 %result to half
+  ret half %cast.result
+}
+
+define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt) {
+; GFX6-LABEL: v_fshr_i16_vss:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s2, s1, 15
+; GFX6-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
+; GFX6-NEXT:    s_bfe_u32 s1, s2, 0x100000
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i16_vss:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s2, s1, 15
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, s1, v0
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i16_vss:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s2, s1, 15
+; GFX9-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, s1, v0
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX9-NEXT:    s_bfe_u32 s1, s2, 0x100000
+; GFX9-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i16_vss:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, 1, v0
+; GFX10-NEXT:    s_andn2_b32 s2, 15, s1
+; GFX10-NEXT:    s_and_b32 s1, s1, 15
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT:    v_lshlrev_b16_e64 v0, s2, v0
+; GFX10-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt)
+  %cast.result = bitcast i16 %result to half
+  ret half %cast.result
+}
+
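+; For <2 x i16>, GFX9+ mask both lanes' shift amounts at once (the 0xf000f
+; constant) and rebuild the packed result with s_pack_ll_b32_b16; GFX6/GFX8
+; unpack the vector and process each half separately.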
+define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
+; GFX6-LABEL: s_fshr_v2i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_mov_b32 s5, 0xffff
+; GFX6-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX6-NEXT:    s_bfe_u32 s4, 1, 0x100000
+; GFX6-NEXT:    s_and_b32 s6, s1, s5
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX6-NEXT:    s_lshl_b32 s3, s3, s4
+; GFX6-NEXT:    s_bfe_u32 s7, 14, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s4, s1, 17
+; GFX6-NEXT:    s_lshr_b32 s6, s6, 1
+; GFX6-NEXT:    s_lshr_b32 s4, s4, s7
+; GFX6-NEXT:    s_lshr_b32 s6, s6, s7
+; GFX6-NEXT:    s_or_b32 s3, s3, s4
+; GFX6-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX6-NEXT:    s_xor_b32 s2, s2, -1
+; GFX6-NEXT:    s_and_b32 s7, s2, 15
+; GFX6-NEXT:    s_and_b32 s1, s1, s5
+; GFX6-NEXT:    s_or_b32 s0, s0, s6
+; GFX6-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX6-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX6-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s7
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_and_b32 s1, s6, 15
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX6-NEXT:    s_andn2_b32 s2, 15, s6
+; GFX6-NEXT:    s_lshl_b32 s1, s3, s1
+; GFX6-NEXT:    s_and_b32 s3, s4, s5
+; GFX6-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s2, s3, s2
+; GFX6-NEXT:    s_or_b32 s1, s1, s2
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_v2i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_bfe_u32 s5, 1, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s6, s6, s5
+; GFX8-NEXT:    s_bfe_u32 s7, 14, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s5
+; GFX8-NEXT:    s_lshr_b32 s6, s6, s7
+; GFX8-NEXT:    s_or_b32 s0, s0, s6
+; GFX8-NEXT:    s_lshr_b32 s6, s4, s5
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s5
+; GFX8-NEXT:    s_lshl_b32 s3, s3, s5
+; GFX8-NEXT:    s_lshr_b32 s6, s6, s7
+; GFX8-NEXT:    s_xor_b32 s2, s2, -1
+; GFX8-NEXT:    s_and_b32 s7, s2, 15
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_or_b32 s3, s3, s6
+; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX8-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX8-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s5
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s7
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s6, 15
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s4, s4, s5
+; GFX8-NEXT:    s_andn2_b32 s2, 15, s6
+; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
+; GFX8-NEXT:    s_bfe_u32 s3, s4, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s3, s3, s5
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s2, s3, s2
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000f
+; GFX9-NEXT:    s_and_b32 s4, s2, s3
+; GFX9-NEXT:    s_andn2_b32 s2, s3, s2
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x10001
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX9-NEXT:    s_lshl_b32 s2, s3, s5
+; GFX9-NEXT:    s_mov_b32 s3, 0xffff
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX9-NEXT:    s_and_b32 s1, s1, s3
+; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
+; GFX9-NEXT:    s_and_b32 s3, s4, s3
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX9-NEXT:    s_lshr_b32 s2, s2, s5
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_v2i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX10-NEXT:    s_mov_b32 s3, 0xf000f
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x10001
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX10-NEXT:    s_and_b32 s5, s2, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
+; GFX10-NEXT:    s_andn2_b32 s2, s3, s2
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX10-NEXT:    s_lshl_b32 s2, s3, s4
+; GFX10-NEXT:    s_mov_b32 s3, 0xffff
+; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX10-NEXT:    s_and_b32 s1, s1, s3
+; GFX10-NEXT:    s_and_b32 s3, s5, s3
+; GFX10-NEXT:    s_lshr_b32 s5, s5, 16
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX10-NEXT:    s_lshr_b32 s3, s4, s5
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt)
+  %cast = bitcast <2 x i16> %result to i32
+  ret i32 %cast
+}
+
+define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
+; GFX6-LABEL: v_fshr_v2i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s5, 0xffff
+; GFX6-NEXT:    v_and_b32_e32 v4, s5, v1
+; GFX6-NEXT:    s_bfe_u32 s4, 1, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    s_bfe_u32 s6, 14, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, s6, v4
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 17, v1
+; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, s4, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, s6, v4
+; GFX6-NEXT:    v_and_b32_e32 v6, 15, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, s5, v1
+; GFX6-NEXT:    v_bfe_u32 v6, v6, 0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v6, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 15, v5
+; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v5
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v1, v3
+; GFX6-NEXT:    v_and_b32_e32 v3, s5, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_v2i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 1, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 1, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 14, v4
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_mov_b32_e32 v4, 1
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v5, 14, v5
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_lshlrev_b16_e32 v5, 1, v1
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_and_b32_e32 v6, 15, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT:    v_lshrrev_b16_e32 v5, 1, v5
+; GFX8-NEXT:    v_lshlrev_b16_e32 v3, v6, v3
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v2, v5
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_and_b32_e32 v3, 15, v4
+; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v4, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xf000f
+; GFX9-NEXT:    v_and_b32_e32 v3, s4, v2
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
+; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_v2i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
+; GFX10-NEXT:    s_mov_b32 s4, 0xf000f
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt)
+  ret <2 x i16> %result
+}
+
+define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
+; GFX6-LABEL: v_fshr_v2i16_4_8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_sub_i32 s4, 0, 4
+; GFX6-NEXT:    s_and_b32 s6, s4, 15
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; GFX6-NEXT:    s_xor_b32 s4, s4, -1
+; GFX6-NEXT:    s_sub_i32 s5, 0, 8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, s4, v3
+; GFX6-NEXT:    s_and_b32 s4, s5, 15
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT:    s_xor_b32 s5, s5, -1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 17, v1
+; GFX6-NEXT:    s_bfe_u32 s4, s5, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s4, v1
+; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_v2i16_4_8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_sub_i32 s4, 0, 4
+; GFX8-NEXT:    s_and_b32 s6, s4, 15
+; GFX8-NEXT:    s_sub_i32 s5, 0, 8
+; GFX8-NEXT:    s_xor_b32 s4, s4, -1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 1, v1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, s4, v3
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, s6, v0
+; GFX8-NEXT:    s_and_b32 s4, s5, 15
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT:    v_mov_b32_e32 v3, s4
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 1
+; GFX8-NEXT:    s_xor_b32 s5, s5, -1
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, s5, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_v2i16_4_8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, 16
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, 16
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX9-NEXT:    s_sub_i32 s4, 0, 16
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, s4, v2
+; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v3
+; GFX9-NEXT:    v_mul_hi_u32 v4, v2, v4
+; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
+; GFX9-NEXT:    v_mul_hi_u32 v2, 4, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
+; GFX9-NEXT:    v_mul_hi_u32 v3, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX9-NEXT:    v_sub_u32_e32 v2, 4, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v4, 16, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 16, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, 16, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 16, v2
+; GFX9-NEXT:    v_sub_u32_e32 v3, 8, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v4, 16, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 16, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v4, 16, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 16, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v4, v3
+; GFX9-NEXT:    v_pk_sub_i16 v3, 16, v2 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
+; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_v2i16_4_8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v2, 16
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, 16
+; GFX10-NEXT:    s_sub_i32 s4, 0, 16
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX10-NEXT:    v_mul_lo_u32 v4, s4, v2
+; GFX10-NEXT:    v_mul_lo_u32 v5, s4, v3
+; GFX10-NEXT:    v_mul_hi_u32 v4, v2, v4
+; GFX10-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v5
+; GFX10-NEXT:    v_mul_hi_u32 v2, 8, v2
+; GFX10-NEXT:    v_mul_hi_u32 v3, 4, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 8, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 4, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v4, 16, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v4, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, 16, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX10-NEXT:    v_and_or_b32 v2, v3, 0xffff, v2
+; GFX10-NEXT:    v_pk_sub_i16 v3, 16, v2 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> <i16 4, i16 8>)
+  ret <2 x i16> %result
+}
+
+define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> %amt) {
+; GFX6-LABEL: v_fshr_v2i16_ssv:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_mov_b32 s4, 0xffff
+; GFX6-NEXT:    s_and_b32 s5, s1, s4
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX6-NEXT:    s_bfe_u32 s3, 1, 0x100000
+; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX6-NEXT:    s_lshl_b32 s2, s2, s3
+; GFX6-NEXT:    s_lshr_b32 s5, s5, 1
+; GFX6-NEXT:    s_bfe_u32 s6, 14, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s3, s1, 17
+; GFX6-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX6-NEXT:    s_lshr_b32 s5, s5, s6
+; GFX6-NEXT:    s_lshr_b32 s3, s3, s6
+; GFX6-NEXT:    s_or_b32 s2, s2, s3
+; GFX6-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX6-NEXT:    s_or_b32 s0, s0, s5
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX6-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
+; GFX6-NEXT:    s_and_b32 s0, s1, s4
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, 15, v1
+; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX6-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX6-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX6-NEXT:    s_and_b32 s0, s3, s4
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT:    v_lshl_b32_e32 v2, s2, v2
+; GFX6-NEXT:    v_lshr_b32_e32 v1, s0, v1
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_v2i16_ssv:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_bfe_u32 s4, 1, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s5, s5, s4
+; GFX8-NEXT:    s_bfe_u32 s6, 14, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX8-NEXT:    s_lshr_b32 s5, s5, s6
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s5
+; GFX8-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    v_lshlrev_b16_e64 v2, v2, s0
+; GFX8-NEXT:    s_bfe_u32 s0, s1, 0x100000
+; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s4
+; GFX8-NEXT:    s_lshr_b32 s5, s3, s4
+; GFX8-NEXT:    v_lshrrev_b16_e64 v0, v0, s0
+; GFX8-NEXT:    s_lshl_b32 s3, s3, s4
+; GFX8-NEXT:    s_bfe_u32 s0, s3, 0x100000
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 15, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    s_lshl_b32 s2, s2, s4
+; GFX8-NEXT:    s_lshr_b32 s5, s5, s6
+; GFX8-NEXT:    s_or_b32 s2, s2, s5
+; GFX8-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s4
+; GFX8-NEXT:    v_lshlrev_b16_e64 v2, v2, s2
+; GFX8-NEXT:    v_lshrrev_b16_e64 v1, v1, s0
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_v2i16_ssv:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s2, 0xf000f
+; GFX9-NEXT:    v_and_b32_e32 v1, s2, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x10001
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v0, s0
+; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v1, s1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_v2i16_ssv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v0
+; GFX10-NEXT:    s_mov_b32 s2, 0xf000f
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX10-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x10001
+; GFX10-NEXT:    v_and_b32_e32 v1, s2, v1
+; GFX10-NEXT:    s_lshl_b32 s2, s3, 1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX10-NEXT:    v_pk_lshrrev_b16 v0, v0, s1
+; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, s0
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt)
+  %cast = bitcast <2 x i16> %result to float
+  ret float %cast
+}
+
+define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) {
+; GFX6-LABEL: v_fshr_v2i16_svs:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_mov_b32 s4, 0xffff
+; GFX6-NEXT:    v_and_b32_e32 v1, s4, v0
+; GFX6-NEXT:    s_bfe_u32 s3, 1, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    s_bfe_u32 s5, 14, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 17, v0
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s5, v1
+; GFX6-NEXT:    v_or_b32_e32 v1, s0, v1
+; GFX6-NEXT:    s_lshl_b32 s0, s2, s3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, s5, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_or_b32_e32 v2, s0, v2
+; GFX6-NEXT:    s_xor_b32 s0, s1, -1
+; GFX6-NEXT:    s_and_b32 s2, s0, 15
+; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX6-NEXT:    s_andn2_b32 s0, 15, s0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
+; GFX6-NEXT:    s_and_b32 s0, s1, 15
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s2, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s0, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT:    s_bfe_u32 s0, s1, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, s0, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_v2i16_svs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 1
+; GFX8-NEXT:    s_bfe_u32 s3, 1, 0x100000
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v3, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 14, v1
+; GFX8-NEXT:    v_or_b32_e32 v1, s0, v1
+; GFX8-NEXT:    s_lshl_b32 s0, s2, s3
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 14, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, s0, v3
+; GFX8-NEXT:    s_xor_b32 s0, s1, -1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 1, v0
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    s_and_b32 s2, s0, 15
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX8-NEXT:    s_andn2_b32 s0, 15, s0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 1, v4
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, s0, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, s2, v1
+; GFX8-NEXT:    s_and_b32 s0, s1, 15
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 1, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, s0, v3
+; GFX8-NEXT:    v_lshrrev_b16_e32 v0, s1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_v2i16_svs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s2, 0xf000f
+; GFX9-NEXT:    s_and_b32 s3, s1, s2
+; GFX9-NEXT:    s_andn2_b32 s1, s2, s1
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x10001
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX9-NEXT:    s_lshl_b32 s1, s2, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX9-NEXT:    v_pk_lshrrev_b16 v0, s3, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_v2i16_svs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX10-NEXT:    s_mov_b32 s2, 0xf000f
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x10001
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX10-NEXT:    s_and_b32 s4, s1, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
+; GFX10-NEXT:    s_andn2_b32 s1, s2, s1
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX10-NEXT:    v_pk_lshrrev_b16 v0, s4, v0
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX10-NEXT:    s_lshl_b32 s1, s2, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt)
+  %cast = bitcast <2 x i16> %result to float
+  ret float %cast
+}
+
+define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
+; GFX6-LABEL: v_fshr_v2i16_vss:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_mov_b32 s3, 0xffff
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    s_bfe_u32 s2, 1, 0x100000
+; GFX6-NEXT:    s_and_b32 s4, s0, s3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s2, v1
+; GFX6-NEXT:    s_bfe_u32 s5, 14, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 17
+; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX6-NEXT:    s_lshr_b32 s2, s2, s5
+; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
+; GFX6-NEXT:    v_or_b32_e32 v1, s2, v1
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX6-NEXT:    s_xor_b32 s1, s1, -1
+; GFX6-NEXT:    s_and_b32 s5, s1, 15
+; GFX6-NEXT:    s_and_b32 s0, s0, s3
+; GFX6-NEXT:    v_or_b32_e32 v0, s4, v0
+; GFX6-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX6-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s5, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT:    s_and_b32 s0, s4, 15
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX6-NEXT:    s_andn2_b32 s1, 15, s4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
+; GFX6-NEXT:    s_and_b32 s0, s2, s3
+; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX6-NEXT:    v_or_b32_e32 v1, s0, v1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_v2i16_vss:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s4, 1, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s3, s3, s4
+; GFX8-NEXT:    s_bfe_u32 s5, 14, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 1, v0
+; GFX8-NEXT:    s_lshr_b32 s3, s3, s5
+; GFX8-NEXT:    v_or_b32_e32 v1, s3, v1
+; GFX8-NEXT:    s_lshr_b32 s3, s2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, 1
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    s_lshr_b32 s3, s3, s5
+; GFX8-NEXT:    s_xor_b32 s1, s1, -1
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX8-NEXT:    v_or_b32_e32 v0, s3, v0
+; GFX8-NEXT:    s_and_b32 s5, s1, 15
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s4
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, s5, v1
+; GFX8-NEXT:    v_or_b32_e32 v1, s0, v1
+; GFX8-NEXT:    s_and_b32 s0, s3, 15
+; GFX8-NEXT:    s_lshl_b32 s2, s2, s4
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s3
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, s0, v0
+; GFX8-NEXT:    s_bfe_u32 s0, s2, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s4
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_v2i16_vss:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s2, 0xf000f
+; GFX9-NEXT:    s_and_b32 s3, s1, s2
+; GFX9-NEXT:    s_andn2_b32 s1, s2, s1
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s1, v0
+; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX9-NEXT:    s_and_b32 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX9-NEXT:    s_and_b32 s2, s3, s2
+; GFX9-NEXT:    s_lshr_b32 s0, s0, s2
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_v2i16_vss:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    s_mov_b32 s2, 0xf000f
+; GFX10-NEXT:    s_mov_b32 s3, 0xffff
+; GFX10-NEXT:    s_and_b32 s4, s1, s2
+; GFX10-NEXT:    s_andn2_b32 s1, s2, s1
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s1, v0
+; GFX10-NEXT:    s_and_b32 s0, s0, s3
+; GFX10-NEXT:    s_and_b32 s1, s4, s3
+; GFX10-NEXT:    s_lshr_b32 s3, s4, 16
+; GFX10-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX10-NEXT:    s_lshr_b32 s1, s2, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt)
+  %cast = bitcast <2 x i16> %result to float
+  ret float %cast
+}
+
+; ; FIXME
+; define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) {
+;   %result = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
+;   %cast = bitcast <3 x i16> %result to i48
+;   ret i48 %cast
+; }
+
+; ; FIXME
+; define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) {
+;   %result = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
+;   %cast.result = bitcast <3 x i16> %result to <3 x half>
+;   ret <3 x half> %cast.result
+; }
+
+define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs, <4 x i16> inreg %amt) {
+; GFX6-LABEL: s_fshr_v4i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_mov_b32 s12, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s9, s9, 16
+; GFX6-NEXT:    s_and_b32 s8, s8, s12
+; GFX6-NEXT:    s_or_b32 s8, s9, s8
+; GFX6-NEXT:    s_lshl_b32 s9, s11, 16
+; GFX6-NEXT:    s_and_b32 s11, s4, s12
+; GFX6-NEXT:    s_and_b32 s10, s10, s12
+; GFX6-NEXT:    s_or_b32 s9, s9, s10
+; GFX6-NEXT:    s_bfe_u32 s10, 1, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s11, s11, 1
+; GFX6-NEXT:    s_bfe_u32 s13, 14, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s10
+; GFX6-NEXT:    s_lshr_b32 s11, s11, s13
+; GFX6-NEXT:    s_or_b32 s0, s0, s11
+; GFX6-NEXT:    s_and_b32 s11, s5, s12
+; GFX6-NEXT:    s_lshr_b32 s11, s11, 1
+; GFX6-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX6-NEXT:    s_xor_b32 s8, s8, -1
+; GFX6-NEXT:    s_lshl_b32 s1, s1, s10
+; GFX6-NEXT:    s_lshr_b32 s11, s11, s13
+; GFX6-NEXT:    s_and_b32 s14, s8, 15
+; GFX6-NEXT:    s_and_b32 s4, s4, s12
+; GFX6-NEXT:    s_or_b32 s1, s1, s11
+; GFX6-NEXT:    s_lshr_b32 s11, s8, 16
+; GFX6-NEXT:    s_andn2_b32 s8, 15, s8
+; GFX6-NEXT:    s_bfe_u32 s14, s14, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s4, s4, s8
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s14
+; GFX6-NEXT:    s_or_b32 s0, s0, s4
+; GFX6-NEXT:    s_and_b32 s4, s11, 15
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX6-NEXT:    s_lshl_b32 s1, s1, s4
+; GFX6-NEXT:    s_and_b32 s4, s5, s12
+; GFX6-NEXT:    s_andn2_b32 s8, 15, s11
+; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX6-NEXT:    s_bfe_u32 s5, s8, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
+; GFX6-NEXT:    s_or_b32 s1, s1, s4
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s2, s10
+; GFX6-NEXT:    s_and_b32 s2, s6, s12
+; GFX6-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX6-NEXT:    s_lshr_b32 s2, s2, s13
+; GFX6-NEXT:    s_or_b32 s1, s1, s2
+; GFX6-NEXT:    s_lshl_b32 s2, s3, s10
+; GFX6-NEXT:    s_and_b32 s3, s7, s12
+; GFX6-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX6-NEXT:    s_lshr_b32 s3, s3, s13
+; GFX6-NEXT:    s_or_b32 s2, s2, s3
+; GFX6-NEXT:    s_lshl_b32 s3, s6, 1
+; GFX6-NEXT:    s_xor_b32 s5, s9, -1
+; GFX6-NEXT:    s_and_b32 s3, s3, s12
+; GFX6-NEXT:    s_lshl_b32 s4, s7, 1
+; GFX6-NEXT:    s_and_b32 s7, s5, 15
+; GFX6-NEXT:    s_lshr_b32 s6, s5, 16
+; GFX6-NEXT:    s_andn2_b32 s5, 15, s5
+; GFX6-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s3, s3, s5
+; GFX6-NEXT:    s_lshl_b32 s1, s1, s7
+; GFX6-NEXT:    s_or_b32 s1, s1, s3
+; GFX6-NEXT:    s_and_b32 s3, s6, 15
+; GFX6-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s2, s2, s3
+; GFX6-NEXT:    s_and_b32 s3, s4, s12
+; GFX6-NEXT:    s_andn2_b32 s5, 15, s6
+; GFX6-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX6-NEXT:    s_bfe_u32 s4, s5, 0x100000
+; GFX6-NEXT:    s_lshr_b32 s3, s3, s4
+; GFX6-NEXT:    s_or_b32 s2, s2, s3
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_or_b32 s1, s1, s2
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_v4i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_bfe_u32 s8, 1, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s9, s9, s8
+; GFX8-NEXT:    s_bfe_u32 s10, 14, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s7, s2, 16
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s8
+; GFX8-NEXT:    s_lshr_b32 s9, s9, s10
+; GFX8-NEXT:    s_or_b32 s0, s0, s9
+; GFX8-NEXT:    s_lshr_b32 s9, s7, s8
+; GFX8-NEXT:    s_lshl_b32 s2, s2, s8
+; GFX8-NEXT:    s_xor_b32 s4, s4, -1
+; GFX8-NEXT:    s_lshl_b32 s6, s6, s8
+; GFX8-NEXT:    s_lshr_b32 s9, s9, s10
+; GFX8-NEXT:    s_and_b32 s11, s4, 15
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT:    s_or_b32 s6, s6, s9
+; GFX8-NEXT:    s_lshr_b32 s9, s4, 16
+; GFX8-NEXT:    s_andn2_b32 s4, 15, s4
+; GFX8-NEXT:    s_bfe_u32 s11, s11, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s2, s2, s8
+; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s2, s2, s4
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s11
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, s9, 15
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s7, s7, s8
+; GFX8-NEXT:    s_andn2_b32 s4, 15, s9
+; GFX8-NEXT:    s_lshl_b32 s2, s6, s2
+; GFX8-NEXT:    s_bfe_u32 s6, s7, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s6, s6, s8
+; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s4, s6, s4
+; GFX8-NEXT:    s_or_b32 s2, s2, s4
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s6, s3, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s6, s6, s8
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s8
+; GFX8-NEXT:    s_lshr_b32 s6, s6, s10
+; GFX8-NEXT:    s_or_b32 s1, s1, s6
+; GFX8-NEXT:    s_lshr_b32 s6, s4, s8
+; GFX8-NEXT:    s_lshl_b32 s3, s3, s8
+; GFX8-NEXT:    s_xor_b32 s5, s5, -1
+; GFX8-NEXT:    s_lshl_b32 s2, s2, s8
+; GFX8-NEXT:    s_lshr_b32 s6, s6, s10
+; GFX8-NEXT:    s_and_b32 s7, s5, 15
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX8-NEXT:    s_or_b32 s2, s2, s6
+; GFX8-NEXT:    s_lshr_b32 s6, s5, 16
+; GFX8-NEXT:    s_andn2_b32 s5, 15, s5
+; GFX8-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s3, s3, s8
+; GFX8-NEXT:    s_bfe_u32 s5, s5, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s3, s3, s5
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s7
+; GFX8-NEXT:    s_or_b32 s1, s1, s3
+; GFX8-NEXT:    s_and_b32 s3, s6, 15
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s4, s4, s8
+; GFX8-NEXT:    s_lshl_b32 s2, s2, s3
+; GFX8-NEXT:    s_bfe_u32 s3, s4, 0x100000
+; GFX8-NEXT:    s_andn2_b32 s5, 15, s6
+; GFX8-NEXT:    s_lshr_b32 s3, s3, s8
+; GFX8-NEXT:    s_bfe_u32 s4, s5, 0x100000
+; GFX8-NEXT:    s_lshr_b32 s3, s3, s4
+; GFX8-NEXT:    s_or_b32 s2, s2, s3
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_v4i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s9, s0, 16
+; GFX9-NEXT:    s_mov_b32 s8, 0x10001
+; GFX9-NEXT:    s_mov_b32 s6, 0xf000f
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s8
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s9
+; GFX9-NEXT:    s_and_b32 s7, s4, s6
+; GFX9-NEXT:    s_andn2_b32 s4, s6, s4
+; GFX9-NEXT:    s_lshr_b32 s9, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s10, s4, 16
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX9-NEXT:    s_lshl_b32 s4, s9, s10
+; GFX9-NEXT:    s_mov_b32 s9, 0xffff
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX9-NEXT:    s_lshr_b32 s10, s7, 16
+; GFX9-NEXT:    s_and_b32 s2, s2, s9
+; GFX9-NEXT:    s_and_b32 s7, s7, s9
+; GFX9-NEXT:    s_lshr_b32 s2, s2, s7
+; GFX9-NEXT:    s_lshr_b32 s4, s4, s10
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s5, s6
+; GFX9-NEXT:    s_andn2_b32 s4, s6, s5
+; GFX9-NEXT:    s_lshr_b32 s5, s1, 16
+; GFX9-NEXT:    s_lshl_b32 s1, s1, s8
+; GFX9-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
+; GFX9-NEXT:    s_lshr_b32 s5, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX9-NEXT:    s_lshl_b32 s1, s1, s4
+; GFX9-NEXT:    s_lshl_b32 s4, s5, s6
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
+; GFX9-NEXT:    s_and_b32 s3, s3, s9
+; GFX9-NEXT:    s_and_b32 s2, s2, s9
+; GFX9-NEXT:    s_lshr_b32 s2, s3, s2
+; GFX9-NEXT:    s_lshr_b32 s3, s4, s5
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_v4i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX10-NEXT:    s_mov_b32 s7, 0x10001
+; GFX10-NEXT:    s_mov_b32 s6, 0xf000f
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s7
+; GFX10-NEXT:    s_lshl_b32 s8, s8, 1
+; GFX10-NEXT:    s_and_b32 s9, s4, s6
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s8
+; GFX10-NEXT:    s_andn2_b32 s4, s6, s4
+; GFX10-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s10, s4, 16
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX10-NEXT:    s_lshl_b32 s4, s8, s10
+; GFX10-NEXT:    s_mov_b32 s8, 0xffff
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
+; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX10-NEXT:    s_lshl_b32 s1, s1, s7
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX10-NEXT:    s_and_b32 s7, s5, s6
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
+; GFX10-NEXT:    s_andn2_b32 s4, s6, s5
+; GFX10-NEXT:    s_lshr_b32 s5, s1, 16
+; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX10-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX10-NEXT:    s_and_b32 s11, s9, s8
+; GFX10-NEXT:    s_and_b32 s2, s2, s8
+; GFX10-NEXT:    s_lshr_b32 s9, s9, 16
+; GFX10-NEXT:    s_lshl_b32 s1, s1, s4
+; GFX10-NEXT:    s_lshl_b32 s4, s5, s6
+; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX10-NEXT:    s_and_b32 s6, s7, s8
+; GFX10-NEXT:    s_and_b32 s3, s3, s8
+; GFX10-NEXT:    s_lshr_b32 s7, s7, 16
+; GFX10-NEXT:    s_lshr_b32 s2, s2, s11
+; GFX10-NEXT:    s_lshr_b32 s9, s10, s9
+; GFX10-NEXT:    s_lshr_b32 s3, s3, s6
+; GFX10-NEXT:    s_lshr_b32 s5, s5, s7
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
+; GFX10-NEXT:    s_or_b32 s0, s0, s2
+; GFX10-NEXT:    s_or_b32 s1, s1, s3
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
+  %cast.result = bitcast <4 x i16> %result to <2 x i32>
+  ret <2 x i32> %cast.result
+}
+
+define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) {
+; GFX6-LABEL: v_fshr_v4i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v12, 0xffff
+; GFX6-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX6-NEXT:    v_and_b32_e32 v8, v8, v12
+; GFX6-NEXT:    v_or_b32_e32 v8, v9, v8
+; GFX6-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX6-NEXT:    v_and_b32_e32 v10, v10, v12
+; GFX6-NEXT:    s_mov_b32 s5, 0xffff
+; GFX6-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX6-NEXT:    v_and_b32_e32 v10, s5, v4
+; GFX6-NEXT:    s_bfe_u32 s4, 1, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 1, v10
+; GFX6-NEXT:    s_bfe_u32 s6, 14, 0x100000
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v10, s6, v10
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v10
+; GFX6-NEXT:    v_and_b32_e32 v10, s5, v5
+; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 1, v10
+; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v10, s6, v10
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_and_b32_e32 v11, 15, v8
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
+; GFX6-NEXT:    v_and_b32_e32 v4, s5, v4
+; GFX6-NEXT:    v_bfe_u32 v11, v11, 0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_bfe_u32 v8, v8, 0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v8, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v11, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX6-NEXT:    v_and_b32_e32 v4, 15, v10
+; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v10
+; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
+; GFX6-NEXT:    v_and_b32_e32 v4, s5, v5
+; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_bfe_u32 v5, v8, 0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX6-NEXT:    v_and_b32_e32 v4, s5, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, s6, v4
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX6-NEXT:    v_and_b32_e32 v4, s5, v7
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, s4, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, s6, v4
+; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v6
+; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v9
+; GFX6-NEXT:    v_and_b32_e32 v8, 15, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 1, v7
+; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v6
+; GFX6-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX6-NEXT:    v_and_b32_e32 v4, s5, v4
+; GFX6-NEXT:    v_bfe_u32 v8, v8, 0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_bfe_u32 v6, v6, 0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v6, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v8, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX6-NEXT:    v_and_b32_e32 v4, 15, v7
+; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v7
+; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v4, v3
+; GFX6-NEXT:    v_and_b32_e32 v4, s5, v5
+; GFX6-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_bfe_u32 v5, v6, 0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
+; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_v4i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 1, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 1, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 14, v7
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX8-NEXT:    v_mov_b32_e32 v7, 1
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v8, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v8, 14, v8
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 1, v2
+; GFX8-NEXT:    v_and_b32_e32 v10, 15, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
+; GFX8-NEXT:    v_lshrrev_b16_e32 v8, 1, v8
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v4, v8
+; GFX8-NEXT:    v_xor_b32_e32 v8, -1, v9
+; GFX8-NEXT:    v_lshlrev_b16_e32 v6, v10, v6
+; GFX8-NEXT:    v_or_b32_e32 v4, v6, v4
+; GFX8-NEXT:    v_and_b32_e32 v6, 15, v9
+; GFX8-NEXT:    v_and_b32_e32 v8, 15, v8
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 1, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v6, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v8, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 1, v3
+; GFX8-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 1, v1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 14, v6
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v6, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v5
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 14, v6
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 1, v3
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_and_b32_e32 v8, 15, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, 15, v5
+; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 1, v6
+; GFX8-NEXT:    v_lshrrev_b16_e32 v5, v5, v6
+; GFX8-NEXT:    v_lshlrev_b16_e32 v4, v8, v4
+; GFX8-NEXT:    v_xor_b32_e32 v6, -1, v7
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, 15, v7
+; GFX8-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 1, v3
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, v5, v1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, v6, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_v4i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xf000f
+; GFX9-NEXT:    v_and_b32_e32 v6, s4, v4
+; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v5
+; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v6, v2
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, s4, v5
+; GFX9-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v4, v1
+; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v2, v3
+; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_v4i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v4
+; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v5
+; GFX10-NEXT:    s_mov_b32 s4, 0xf000f
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX10-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX10-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_lshrrev_b16 v2, v4, v2
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v6, v0
+; GFX10-NEXT:    v_pk_lshrrev_b16 v3, v5, v3
+; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v7, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
+  %cast.result = bitcast <4 x i16> %result to <4 x half>
+  ret <4 x half> %cast.result
+}
+
+define amdgpu_ps i64 @s_fshr_i64(i64 inreg %lhs, i64 inreg %rhs, i64 inreg %amt) {
+; GFX6-LABEL: s_fshr_i64:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b64 s[6:7], s[4:5], 63
+; GFX6-NEXT:    s_andn2_b64 s[4:5], 63, s[4:5]
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s4
+; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], s6
+; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b64 s[6:7], s[4:5], 63
+; GFX8-NEXT:    s_andn2_b64 s[4:5], 63, s[4:5]
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s4
+; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], s6
+; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b64 s[6:7], s[4:5], 63
+; GFX9-NEXT:    s_andn2_b64 s[4:5], 63, s[4:5]
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s4
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], s6
+; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_andn2_b64 s[6:7], 63, s[4:5]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX10-NEXT:    s_and_b64 s[4:5], s[4:5], 63
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s6
+; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt)
+  ret i64 %result
+}
+
+define amdgpu_ps i64 @s_fshr_i64_5(i64 inreg %lhs, i64 inreg %rhs) {
+; GCN-LABEL: s_fshr_i64_5:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_lshl_b32 s1, s0, 27
+; GCN-NEXT:    s_mov_b32 s0, 0
+; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], 5
+; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    ; return to shader part epilog
+  %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 5)
+  ret i64 %result
+}
+
+define amdgpu_ps i64 @s_fshr_i64_32(i64 inreg %lhs, i64 inreg %rhs) {
+; GCN-LABEL: s_fshr_i64_32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s0, 0
+; GCN-NEXT:    s_mov_b32 s2, s3
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    ; return to shader part epilog
+  %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 32)
+  ret i64 %result
+}
+
+define amdgpu_ps i64 @s_fshr_i64_48(i64 inreg %lhs, i64 inreg %rhs) {
+; GCN-LABEL: s_fshr_i64_48:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_lshr_b32 s2, s3, 16
+; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
+; GCN-NEXT:    s_mov_b32 s3, 0
+; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    ; return to shader part epilog
+  %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 48)
+  ret i64 %result
+}
+
+define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
+; GFX6-LABEL: v_fshr_i64:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v5, 63, v4
+; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GFX6-NEXT:    v_and_b32_e32 v4, 63, v4
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
+; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], v5
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v5, 63, v4
+; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX8-NEXT:    v_and_b32_e32 v4, 63, v4
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v5, v[2:3]
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v5, 63, v4
+; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v4, 63, v4
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v5, v[2:3]
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_xor_b32_e32 v5, -1, v4
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX10-NEXT:    v_and_b32_e32 v4, 63, v4
+; GFX10-NEXT:    v_and_b32_e32 v7, 63, v5
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt)
+  ret i64 %result
+}
+
+define i64 @v_fshr_i64_5(i64 %lhs, i64 %rhs) {
+; GFX6-LABEL: v_fshr_i64_5:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[2:3], 5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 27, v4
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i64_5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v4, v0
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 5, v[2:3]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 27, v4
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i64_5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 5, v[2:3]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 27, v4
+; GFX9-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i64_5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 5, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 27, v4
+; GFX10-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 5)
+  ret i64 %result
+}
+
+define i64 @v_fshr_i64_32(i64 %lhs, i64 %rhs) {
+; GFX6-LABEL: v_fshr_i64_32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, v3
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i64_32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i64_32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i64_32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 32)
+  ret i64 %result
+}
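+
+; fshr(x, y, 32) on i64 is (x << 32) | (y >> 32), i.e. the concatenation of
+; the low word of x with the high word of y, so the lowering folds to two
+; register moves on every target.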
+
+define i64 @v_fshr_i64_48(i64 %lhs, i64 %rhs) {
+; GFX6-LABEL: v_fshr_i64_48:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i64_48:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i64_48:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i64_48:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 48)
+  ret i64 %result
+}
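+
+; fshr by 48 folds to (lhs << 16) | (rhs >> 48); only the top 16 bits of v3
+; feed the low result word, which GFX8+ expresses with a single SDWA or
+; selecting WORD_1 of v3 instead of a separate 32-bit shift.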
+
+define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 %amt) {
+; GFX6-LABEL: v_fshr_i64_ssv:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_and_b32_e32 v2, 63, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 63, v0
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX6-NEXT:    v_lshl_b64 v[0:1], s[0:1], v0
+; GFX6-NEXT:    v_lshr_b64 v[2:3], s[2:3], v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i64_ssv:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_and_b32_e32 v2, 63, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 63, v0
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v0, s[0:1]
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v2, s[2:3]
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i64_ssv:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_and_b32_e32 v2, 63, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 63, v0
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v0, s[0:1]
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v2, s[2:3]
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i64_ssv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 63, v0
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX10-NEXT:    v_and_b32_e32 v2, 63, v1
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v0, s[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, s[0:1]
+; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt)
+  %cast = bitcast i64 %result to <2 x float>
+  ret <2 x float> %cast
+}
+
+define amdgpu_ps <2 x float> @v_fshr_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg %amt) {
+; GFX6-LABEL: v_fshr_i64_svs:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b64 s[4:5], s[2:3], 63
+; GFX6-NEXT:    s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], s4
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
+; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, s1, v1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i64_svs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b64 s[4:5], s[2:3], 63
+; GFX8-NEXT:    s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, s1, v1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i64_svs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b64 s[4:5], s[2:3], 63
+; GFX9-NEXT:    s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_or_b32_e32 v1, s1, v1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i64_svs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b64 s[4:5], s[2:3], 63
+; GFX10-NEXT:    s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, s1, v1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt)
+  %cast = bitcast i64 %result to <2 x float>
+  ret <2 x float> %cast
+}
+
+define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg %amt) {
+; GFX6-LABEL: v_fshr_i64_vss:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GFX6-NEXT:    s_and_b64 s[4:5], s[2:3], 63
+; GFX6-NEXT:    s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], s2
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], s4
+; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, s1, v1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i64_vss:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX8-NEXT:    s_and_b64 s[4:5], s[2:3], 63
+; GFX8-NEXT:    s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s2, v[0:1]
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], s4
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, s1, v1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i64_vss:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX9-NEXT:    s_and_b64 s[4:5], s[2:3], 63
+; GFX9-NEXT:    s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s2, v[0:1]
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], s4
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_or_b32_e32 v1, s1, v1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i64_vss:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX10-NEXT:    s_andn2_b64 s[4:5], 63, s[2:3]
+; GFX10-NEXT:    s_and_b64 s[2:3], s[2:3], 63
+; GFX10-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, s1, v1
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt)
+  %cast = bitcast i64 %result to <2 x float>
+  ret <2 x float> %cast
+}
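+
+; The ssv/svs/vss variants above check the mixed-operand cases: a uniform
+; (inreg) amount is masked with scalar s_and/s_andn2 ops, while a divergent
+; amount needs per-lane v_and/v_xor, and only the values that live in VGPRs
+; are shifted on the VALU.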
+
+define amdgpu_ps <2 x i64> @s_fshr_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs, <2 x i64> inreg %amt) {
+; GFX6-LABEL: s_fshr_v2i64:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b64 s[12:13], s[8:9], 63
+; GFX6-NEXT:    s_andn2_b64 s[8:9], 63, s[8:9]
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s8
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
+; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX6-NEXT:    s_and_b64 s[4:5], s[10:11], 63
+; GFX6-NEXT:    s_andn2_b64 s[8:9], 63, s[10:11]
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[6:7], s4
+; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_v2i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b64 s[12:13], s[8:9], 63
+; GFX8-NEXT:    s_andn2_b64 s[8:9], 63, s[8:9]
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s8
+; GFX8-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
+; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT:    s_and_b64 s[4:5], s[10:11], 63
+; GFX8-NEXT:    s_andn2_b64 s[8:9], 63, s[10:11]
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
+; GFX8-NEXT:    s_lshr_b64 s[4:5], s[6:7], s4
+; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_v2i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b64 s[12:13], s[8:9], 63
+; GFX9-NEXT:    s_andn2_b64 s[8:9], 63, s[8:9]
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s8
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
+; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT:    s_and_b64 s[4:5], s[10:11], 63
+; GFX9-NEXT:    s_andn2_b64 s[8:9], 63, s[10:11]
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], s4
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_v2i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_andn2_b64 s[12:13], 63, s[8:9]
+; GFX10-NEXT:    s_and_b64 s[8:9], s[8:9], 63
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
+; GFX10-NEXT:    s_andn2_b64 s[8:9], 63, s[10:11]
+; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
+; GFX10-NEXT:    s_and_b64 s[10:11], s[10:11], 63
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s12
+; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], s10
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt)
+  ret <2 x i64> %result
+}
+
+define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
+; GFX6-LABEL: v_fshr_v2i64:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v9, 63, v8
+; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GFX6-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[4:5], v9
+; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v10
+; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX6-NEXT:    v_and_b32_e32 v4, 63, v10
+; GFX6-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], v8
+; GFX6-NEXT:    v_lshr_b64 v[6:7], v[6:7], v4
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX6-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_v2i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v9, 63, v8
+; GFX8-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX8-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v9, v[4:5]
+; GFX8-NEXT:    v_xor_b32_e32 v8, -1, v10
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_and_b32_e32 v4, 63, v10
+; GFX8-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v8, v[2:3]
+; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v4, v[6:7]
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_v2i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v9, 63, v8
+; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v9, v[4:5]
+; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v10
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 63, v10
+; GFX9-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v8, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b64 v[6:7], v4, v[6:7]
+; GFX9-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX9-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX9-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_v2i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v8
+; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v10
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX10-NEXT:    v_and_b32_e32 v19, 63, v8
+; GFX10-NEXT:    v_and_b32_e32 v15, 63, v9
+; GFX10-NEXT:    v_and_b32_e32 v9, 63, v11
+; GFX10-NEXT:    v_and_b32_e32 v13, 63, v10
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v19, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[11:12], v15, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[15:16], v9, v[2:3]
+; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v13, v[6:7]
+; GFX10-NEXT:    v_or_b32_e32 v0, v11, v4
+; GFX10-NEXT:    v_or_b32_e32 v1, v12, v5
+; GFX10-NEXT:    v_or_b32_e32 v2, v15, v6
+; GFX10-NEXT:    v_or_b32_e32 v3, v16, v7
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt)
+  ret <2 x i64> %result
+}
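+
+; The <2 x i64> cases scalarize: both the SALU and VALU expansions are just
+; two independent copies of the i64 sequence, one per vector element.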
+
+define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) {
+; GFX6-LABEL: s_fshr_i128:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_movk_i32 s10, 0x7f
+; GFX6-NEXT:    s_mov_b32 s11, 0
+; GFX6-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
+; GFX6-NEXT:    s_andn2_b64 s[8:9], s[10:11], s[8:9]
+; GFX6-NEXT:    s_sub_i32 s9, 1, 64
+; GFX6-NEXT:    s_sub_i32 s13, 64, 1
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[14:15], s[0:1], s13
+; GFX6-NEXT:    s_lshl_b64 s[16:17], s[2:3], 1
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
+; GFX6-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
+; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX6-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[14:15], s[0:1]
+; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT:    s_sub_i32 s13, s8, 64
+; GFX6-NEXT:    s_sub_i32 s9, 64, s8
+; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[10:11], s8
+; GFX6-NEXT:    s_lshr_b64 s[14:15], s[10:11], s9
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
+; GFX6-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[10:11], s13
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
+; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
+; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX6-NEXT:    s_cselect_b64 s[8:9], s[0:1], s[8:9]
+; GFX6-NEXT:    s_sub_i32 s14, s12, 64
+; GFX6-NEXT:    s_sub_i32 s13, 64, s12
+; GFX6-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX6-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[6:7], s12
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[4:5], s12
+; GFX6-NEXT:    s_lshl_b64 s[12:13], s[6:7], s13
+; GFX6-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
+; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX6-NEXT:    s_cselect_b64 s[6:7], s[10:11], s[6:7]
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX6-NEXT:    s_cselect_b64 s[6:7], s[0:1], 0
+; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[4:5]
+; GFX6-NEXT:    s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i128:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_movk_i32 s10, 0x7f
+; GFX8-NEXT:    s_mov_b32 s11, 0
+; GFX8-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
+; GFX8-NEXT:    s_andn2_b64 s[8:9], s[10:11], s[8:9]
+; GFX8-NEXT:    s_sub_i32 s9, 1, 64
+; GFX8-NEXT:    s_sub_i32 s13, 64, 1
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[14:15], s[0:1], s13
+; GFX8-NEXT:    s_lshl_b64 s[16:17], s[2:3], 1
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
+; GFX8-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
+; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX8-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[14:15], s[0:1]
+; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT:    s_sub_i32 s13, s8, 64
+; GFX8-NEXT:    s_sub_i32 s9, 64, s8
+; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[10:11], s8
+; GFX8-NEXT:    s_lshr_b64 s[14:15], s[10:11], s9
+; GFX8-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
+; GFX8-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[10:11], s13
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
+; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
+; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX8-NEXT:    s_cselect_b64 s[8:9], s[0:1], s[8:9]
+; GFX8-NEXT:    s_sub_i32 s14, s12, 64
+; GFX8-NEXT:    s_sub_i32 s13, 64, s12
+; GFX8-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX8-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[6:7], s12
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[4:5], s12
+; GFX8-NEXT:    s_lshl_b64 s[12:13], s[6:7], s13
+; GFX8-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX8-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
+; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_cselect_b64 s[6:7], s[10:11], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_cselect_b64 s[6:7], s[0:1], 0
+; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[4:5]
+; GFX8-NEXT:    s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i128:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_movk_i32 s10, 0x7f
+; GFX9-NEXT:    s_mov_b32 s11, 0
+; GFX9-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
+; GFX9-NEXT:    s_andn2_b64 s[8:9], s[10:11], s[8:9]
+; GFX9-NEXT:    s_sub_i32 s9, 1, 64
+; GFX9-NEXT:    s_sub_i32 s13, 64, 1
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[14:15], s[0:1], s13
+; GFX9-NEXT:    s_lshl_b64 s[16:17], s[2:3], 1
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
+; GFX9-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
+; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX9-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[14:15], s[0:1]
+; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT:    s_sub_i32 s13, s8, 64
+; GFX9-NEXT:    s_sub_i32 s9, 64, s8
+; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[10:11], s8
+; GFX9-NEXT:    s_lshr_b64 s[14:15], s[10:11], s9
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
+; GFX9-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[10:11], s13
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
+; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
+; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX9-NEXT:    s_cselect_b64 s[8:9], s[0:1], s[8:9]
+; GFX9-NEXT:    s_sub_i32 s14, s12, 64
+; GFX9-NEXT:    s_sub_i32 s13, 64, s12
+; GFX9-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX9-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[6:7], s12
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[4:5], s12
+; GFX9-NEXT:    s_lshl_b64 s[12:13], s[6:7], s13
+; GFX9-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX9-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
+; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_cselect_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_cselect_b64 s[6:7], s[0:1], 0
+; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[4:5]
+; GFX9-NEXT:    s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i128:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_movk_i32 s10, 0x7f
+; GFX10-NEXT:    s_mov_b32 s11, 0
+; GFX10-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
+; GFX10-NEXT:    s_andn2_b64 s[8:9], s[10:11], s[8:9]
+; GFX10-NEXT:    s_sub_i32 s9, 1, 64
+; GFX10-NEXT:    s_sub_i32 s10, 64, 1
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[0:1], s10
+; GFX10-NEXT:    s_lshl_b64 s[14:15], s[2:3], 1
+; GFX10-NEXT:    s_lshl_b64 s[16:17], s[0:1], 1
+; GFX10-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
+; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX10-NEXT:    s_cselect_b64 s[14:15], s[16:17], 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT:    s_sub_i32 s13, s8, 64
+; GFX10-NEXT:    s_sub_i32 s2, 64, s8
+; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[0:1], s8
+; GFX10-NEXT:    s_lshr_b64 s[2:3], s[14:15], s2
+; GFX10-NEXT:    s_lshl_b64 s[8:9], s[14:15], s8
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[10:11]
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[14:15], s13
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[10:11]
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[0:1], s[2:3]
+; GFX10-NEXT:    s_sub_i32 s14, s12, 64
+; GFX10-NEXT:    s_sub_i32 s10, 64, s12
+; GFX10-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[0:1], s[4:5], s12
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[6:7], s10
+; GFX10-NEXT:    s_lshr_b64 s[12:13], s[6:7], s12
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[12:13], 0
+; GFX10-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
+  ret i128 %result
+}
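+
+; For i128 each 64-bit-pair shift in the funnel expansion is itself expanded
+; into the wide-shift select sequence (sub/cmp_lt_u32/cselect against 64).
+; The comparisons against the constant 1 (s_cmp_lt_u32 1, 64 and
+; s_cmp_eq_u32 1, 0) come from the unfolded shift-by-1 pre-step; presumably a
+; later combine could constant-fold them away.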
+
+define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
+; GFX6-LABEL: v_fshr_i128:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_movk_i32 s4, 0x7f
+; GFX6-NEXT:    v_and_b32_e32 v14, s4, v8
+; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX6-NEXT:    v_and_b32_e32 v15, s4, v8
+; GFX6-NEXT:    s_sub_i32 s5, 64, 1
+; GFX6-NEXT:    s_sub_i32 s4, 1, 64
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[0:1], s5
+; GFX6-NEXT:    v_lshl_b64 v[10:11], v[2:3], 1
+; GFX6-NEXT:    v_lshl_b64 v[12:13], v[0:1], 1
+; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], s4
+; GFX6-NEXT:    s_and_b32 s4, 1, s6
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX6-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX6-NEXT:    s_and_b32 s4, 1, s7
+; GFX6-NEXT:    v_cndmask_b32_e32 v8, 0, v12, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v9, 0, v13, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v15
+; GFX6-NEXT:    v_lshr_b64 v[2:3], v[8:9], v2
+; GFX6-NEXT:    v_lshl_b64 v[10:11], v[0:1], v15
+; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, 64, v15
+; GFX6-NEXT:    v_lshl_b64 v[12:13], v[8:9], v15
+; GFX6-NEXT:    v_or_b32_e32 v10, v2, v10
+; GFX6-NEXT:    v_or_b32_e32 v11, v3, v11
+; GFX6-NEXT:    v_lshl_b64 v[2:3], v[8:9], v16
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX6-NEXT:    v_cndmask_b32_e32 v12, 0, v12, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v13, 0, v13, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX6-NEXT:    v_cndmask_b32_e32 v10, v2, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v11, v3, v1, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v14
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[4:5], v14
+; GFX6-NEXT:    v_lshl_b64 v[2:3], v[6:7], v2
+; GFX6-NEXT:    v_subrev_i32_e32 v15, vcc, 64, v14
+; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[6:7], v15
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[6:7], v14
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX6-NEXT:    v_or_b32_e32 v0, v12, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, v13, v1
+; GFX6-NEXT:    v_or_b32_e32 v2, v10, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v11, v3
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i128:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_movk_i32 s4, 0x7f
+; GFX8-NEXT:    v_and_b32_e32 v14, s4, v8
+; GFX8-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX8-NEXT:    v_and_b32_e32 v15, s4, v8
+; GFX8-NEXT:    s_sub_i32 s5, 64, 1
+; GFX8-NEXT:    s_sub_i32 s4, 1, 64
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s5, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[10:11], 1, v[2:3]
+; GFX8-NEXT:    v_lshlrev_b64 v[12:13], 1, v[0:1]
+; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX8-NEXT:    s_and_b32 s4, 1, s6
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX8-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX8-NEXT:    s_and_b32 s4, 1, s7
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, 0, v12, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, 0, v13, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v15
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v2, v[8:9]
+; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v15, v[0:1]
+; GFX8-NEXT:    v_subrev_u32_e32 v16, vcc, 64, v15
+; GFX8-NEXT:    v_lshlrev_b64 v[12:13], v15, v[8:9]
+; GFX8-NEXT:    v_or_b32_e32 v10, v2, v10
+; GFX8-NEXT:    v_or_b32_e32 v11, v3, v11
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v16, v[8:9]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, 0, v12, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, 0, v13, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v2, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v3, v1, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v14
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v14, v[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
+; GFX8-NEXT:    v_subrev_u32_e32 v15, vcc, 64, v14
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v15, v[6:7]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v14, v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX8-NEXT:    v_or_b32_e32 v0, v12, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v13, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v10, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v11, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i128:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0x7f
+; GFX9-NEXT:    v_and_b32_e32 v14, s4, v8
+; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX9-NEXT:    v_and_b32_e32 v15, s4, v8
+; GFX9-NEXT:    s_sub_i32 s5, 64, 1
+; GFX9-NEXT:    s_sub_i32 s4, 1, 64
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s5, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[10:11], 1, v[2:3]
+; GFX9-NEXT:    v_lshlrev_b64 v[12:13], 1, v[0:1]
+; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX9-NEXT:    s_and_b32 s4, 1, s6
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX9-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX9-NEXT:    s_and_b32 s4, 1, s7
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, 0, v12, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, 0, v13, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v15
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v2, v[8:9]
+; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v15, v[0:1]
+; GFX9-NEXT:    v_subrev_u32_e32 v16, 64, v15
+; GFX9-NEXT:    v_lshlrev_b64 v[12:13], v15, v[8:9]
+; GFX9-NEXT:    v_or_b32_e32 v10, v2, v10
+; GFX9-NEXT:    v_or_b32_e32 v11, v3, v11
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v16, v[8:9]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, 0, v12, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, 0, v13, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v2, v0, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v3, v1, vcc
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v14, v[4:5]
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
+; GFX9-NEXT:    v_subrev_u32_e32 v15, 64, v14
+; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v15, v[6:7]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v14, v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX9-NEXT:    v_or_b32_e32 v0, v12, v0
+; GFX9-NEXT:    v_or_b32_e32 v1, v13, v1
+; GFX9-NEXT:    v_or_b32_e32 v2, v10, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, v11, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i128:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_sub_i32 s4, 64, 1
+; GFX10-NEXT:    s_sub_i32 s6, 1, 64
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    v_lshrrev_b64 v[9:10], s4, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[11:12], 1, v[2:3]
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    v_lshlrev_b64 v[13:14], 1, v[0:1]
+; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX10-NEXT:    s_and_b32 s4, 1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s6, v[0:1]
+; GFX10-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s4
+; GFX10-NEXT:    v_or_b32_e32 v10, v10, v12
+; GFX10-NEXT:    v_xor_b32_e32 v15, -1, v8
+; GFX10-NEXT:    s_movk_i32 s5, 0x7f
+; GFX10-NEXT:    s_and_b32 s6, 1, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s4
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s6
+; GFX10-NEXT:    v_and_b32_e32 v19, s5, v15
+; GFX10-NEXT:    v_and_b32_e32 v20, s5, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, 0, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s4
+; GFX10-NEXT:    v_sub_nc_u32_e32 v11, 64, v19
+; GFX10-NEXT:    v_sub_nc_u32_e32 v17, 64, v20
+; GFX10-NEXT:    v_mov_b32_e32 v25, v4
+; GFX10-NEXT:    v_mov_b32_e32 v26, v5
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v19
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v11, v[9:10]
+; GFX10-NEXT:    v_lshlrev_b64 v[11:12], v19, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[13:14], v19, v[9:10]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v21, 64, v20
+; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v8, v[9:10]
+; GFX10-NEXT:    v_lshrrev_b64 v[15:16], v20, v[25:26]
+; GFX10-NEXT:    v_lshlrev_b64 v[17:18], v17, v[6:7]
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v19
+; GFX10-NEXT:    v_or_b32_e32 v10, v3, v12
+; GFX10-NEXT:    v_or_b32_e32 v11, v2, v11
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v21, v[6:7]
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, 0, v13, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v13, v15, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v9, v10, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v10, v16, v18
+; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v20, v[6:7]
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v8, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v13, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v15, v1, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v11, v0, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, 0, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v25, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, v26, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s4
+; GFX10-NEXT:    v_or_b32_e32 v0, v23, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, v10, v1
+; GFX10-NEXT:    v_or_b32_e32 v2, v8, v2
+; GFX10-NEXT:    v_or_b32_e32 v3, v9, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
+  ret i128 %result
+}
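+
+; The VALU i128 version follows the same structure, but the amount is
+; divergent, so every cselect becomes a v_cmp feeding v_cndmask_b32 pairs,
+; one per 32-bit half of each 64-bit result.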
+
+define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) {
+; GFX6-LABEL: v_fshr_i128_ssv:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_sub_i32 s14, 1, 64
+; GFX6-NEXT:    s_sub_i32 s10, 64, 1
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    s_movk_i32 s8, 0x7f
+; GFX6-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX6-NEXT:    v_and_b32_e32 v6, s8, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[0:1], s10
+; GFX6-NEXT:    s_lshl_b64 s[12:13], s[2:3], 1
+; GFX6-NEXT:    v_and_b32_e32 v7, s8, v0
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[0:1], 1
+; GFX6-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s14
+; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v7
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[8:9], v0
+; GFX6-NEXT:    v_lshl_b64 v[2:3], s[0:1], v7
+; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 64, v7
+; GFX6-NEXT:    v_lshl_b64 v[4:5], s[8:9], v7
+; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX6-NEXT:    v_lshl_b64 v[0:1], s[8:9], v8
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX6-NEXT:    v_mov_b32_e32 v3, s1
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v6
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[4:5], v6
+; GFX6-NEXT:    v_lshl_b64 v[2:3], s[6:7], v2
+; GFX6-NEXT:    v_subrev_i32_e32 v11, vcc, 64, v6
+; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[6:7], v11
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX6-NEXT:    v_lshr_b64 v[4:5], s[6:7], v6
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX6-NEXT:    v_mov_b32_e32 v3, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX6-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, v9, v1
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i128_ssv:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_sub_i32 s14, 1, 64
+; GFX8-NEXT:    s_sub_i32 s10, 64, 1
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    s_movk_i32 s8, 0x7f
+; GFX8-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX8-NEXT:    v_and_b32_e32 v6, s8, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[0:1], s10
+; GFX8-NEXT:    s_lshl_b64 s[12:13], s[2:3], 1
+; GFX8-NEXT:    v_and_b32_e32 v7, s8, v0
+; GFX8-NEXT:    s_lshl_b64 s[8:9], s[0:1], 1
+; GFX8-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s14
+; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v7
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, s[8:9]
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v7, s[0:1]
+; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 64, v7
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v7, s[8:9]
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, s[8:9]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v6
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v6, s[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, s[6:7]
+; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, 64, v6
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v11, s[6:7]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v6, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v9, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i128_ssv:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_sub_i32 s14, 1, 64
+; GFX9-NEXT:    s_sub_i32 s10, 64, 1
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    s_movk_i32 s8, 0x7f
+; GFX9-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    v_and_b32_e32 v6, s8, v0
+; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[0:1], s10
+; GFX9-NEXT:    s_lshl_b64 s[12:13], s[2:3], 1
+; GFX9-NEXT:    v_and_b32_e32 v7, s8, v0
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[0:1], 1
+; GFX9-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s14
+; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v7
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, s[8:9]
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v7, s[0:1]
+; GFX9-NEXT:    v_subrev_u32_e32 v8, 64, v7
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v7, s[8:9]
+; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, s[8:9]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v2, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v6, s[4:5]
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, s[6:7]
+; GFX9-NEXT:    v_subrev_u32_e32 v11, 64, v6
+; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v11, s[6:7]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v6, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX9-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX9-NEXT:    v_or_b32_e32 v1, v9, v1
+; GFX9-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i128_ssv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v0
+; GFX10-NEXT:    s_sub_i32 s14, 1, 64
+; GFX10-NEXT:    s_sub_i32 s9, 64, 1
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    s_movk_i32 s8, 0x7f
+; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    v_and_b32_e32 v13, s8, v1
+; GFX10-NEXT:    v_and_b32_e32 v12, s8, v0
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[8:9], s[0:1], s9
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[2:3], 1
+; GFX10-NEXT:    s_lshl_b64 s[12:13], s[0:1], 1
+; GFX10-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s14
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 64, v13
+; GFX10-NEXT:    s_cselect_b64 s[10:11], s[12:13], 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 64, v12
+; GFX10-NEXT:    s_cselect_b64 s[8:9], s[2:3], s[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v0, s[10:11]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v13, s[8:9]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v13
+; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v12, s[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v8, s[6:7]
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v13
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 64, v12
+; GFX10-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, 64, v12
+; GFX10-NEXT:    v_lshlrev_b64 v[15:16], v10, s[10:11]
+; GFX10-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX10-NEXT:    v_or_b32_e32 v6, v6, v8
+; GFX10-NEXT:    v_or_b32_e32 v7, v7, v9
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v0, s[6:7]
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v13, s[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v15, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v16, v3, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v12, s[6:7]
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, 0, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v8, s8, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, s9, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
+; GFX10-NEXT:    v_or_b32_e32 v0, v11, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, v15, v1
+; GFX10-NEXT:    v_or_b32_e32 v2, v19, v2
+; GFX10-NEXT:    v_or_b32_e32 v3, v6, v3
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
+  %cast.result = bitcast i128 %result to <4 x float>
+  ret <4 x float> %cast.result
+}
+
+define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) {
+; GFX6-LABEL: v_fshr_i128_svs:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_movk_i32 s6, 0x7f
+; GFX6-NEXT:    s_mov_b32 s7, 0
+; GFX6-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX6-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX6-NEXT:    s_sub_i32 s5, 1, 64
+; GFX6-NEXT:    s_sub_i32 s9, 64, 1
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[0:1], s9
+; GFX6-NEXT:    s_lshl_b64 s[12:13], s[2:3], 1
+; GFX6-NEXT:    s_lshl_b64 s[6:7], s[0:1], 1
+; GFX6-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
+; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT:    s_sub_i32 s9, s4, 64
+; GFX6-NEXT:    s_sub_i32 s5, 64, s4
+; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX6-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[6:7], s4
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[6:7], s5
+; GFX6-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
+; GFX6-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
+; GFX6-NEXT:    s_lshl_b64 s[6:7], s[6:7], s9
+; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX6-NEXT:    s_sub_i32 s5, 64, s8
+; GFX6-NEXT:    s_sub_i32 s4, s8, 64
+; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], s8
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s5
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], s8
+; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], s4
+; GFX6-NEXT:    s_and_b32 s4, 1, s6
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX6-NEXT:    s_and_b32 s4, 1, s7
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    s_and_b32 s4, 1, s6
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX6-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, s3, v1
+; GFX6-NEXT:    v_or_b32_e32 v2, s0, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, s1, v3
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i128_svs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_movk_i32 s6, 0x7f
+; GFX8-NEXT:    s_mov_b32 s7, 0
+; GFX8-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX8-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX8-NEXT:    s_sub_i32 s5, 1, 64
+; GFX8-NEXT:    s_sub_i32 s9, 64, 1
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[0:1], s9
+; GFX8-NEXT:    s_lshl_b64 s[12:13], s[2:3], 1
+; GFX8-NEXT:    s_lshl_b64 s[6:7], s[0:1], 1
+; GFX8-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
+; GFX8-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT:    s_sub_i32 s9, s4, 64
+; GFX8-NEXT:    s_sub_i32 s5, 64, s4
+; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX8-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[6:7], s4
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[6:7], s5
+; GFX8-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
+; GFX8-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
+; GFX8-NEXT:    s_lshl_b64 s[6:7], s[6:7], s9
+; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT:    s_sub_i32 s5, 64, s8
+; GFX8-NEXT:    s_sub_i32 s4, s8, 64
+; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s8, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s5, v[2:3]
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s8, v[2:3]
+; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], s4, v[2:3]
+; GFX8-NEXT:    s_and_b32 s4, 1, s6
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8-NEXT:    s_and_b32 s4, 1, s7
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    s_and_b32 s4, 1, s6
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, s3, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, s0, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, s1, v3
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i128_svs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_movk_i32 s6, 0x7f
+; GFX9-NEXT:    s_mov_b32 s7, 0
+; GFX9-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX9-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX9-NEXT:    s_sub_i32 s5, 1, 64
+; GFX9-NEXT:    s_sub_i32 s9, 64, 1
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[0:1], s9
+; GFX9-NEXT:    s_lshl_b64 s[12:13], s[2:3], 1
+; GFX9-NEXT:    s_lshl_b64 s[6:7], s[0:1], 1
+; GFX9-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
+; GFX9-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT:    s_sub_i32 s9, s4, 64
+; GFX9-NEXT:    s_sub_i32 s5, 64, s4
+; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[6:7], s4
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[6:7], s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
+; GFX9-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
+; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], s9
+; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT:    s_sub_i32 s5, 64, s8
+; GFX9-NEXT:    s_sub_i32 s4, s8, 64
+; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s8, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s5, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s8, v[2:3]
+; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], s4, v[2:3]
+; GFX9-NEXT:    s_and_b32 s4, 1, s6
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9-NEXT:    s_and_b32 s4, 1, s7
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    s_and_b32 s4, 1, s6
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX9-NEXT:    v_or_b32_e32 v1, s3, v1
+; GFX9-NEXT:    v_or_b32_e32 v2, s0, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, s1, v3
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i128_svs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_movk_i32 s6, 0x7f
+; GFX10-NEXT:    s_mov_b32 s7, 0
+; GFX10-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX10-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-NEXT:    s_sub_i32 s5, 1, 64
+; GFX10-NEXT:    s_sub_i32 s6, 64, 1
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s8, v[0:1]
+; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[2:3], 1
+; GFX10-NEXT:    s_lshl_b64 s[12:13], s[0:1], 1
+; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
+; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    s_cselect_b64 s[10:11], s[12:13], 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT:    s_sub_i32 s9, s4, 64
+; GFX10-NEXT:    s_sub_i32 s2, 64, s4
+; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_lshl_b64 s[6:7], s[0:1], s4
+; GFX10-NEXT:    s_lshr_b64 s[2:3], s[10:11], s2
+; GFX10-NEXT:    s_lshl_b64 s[4:5], s[10:11], s4
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX10-NEXT:    s_lshl_b64 s[6:7], s[10:11], s9
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[4:5], 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[6:7]
+; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[0:1], s[2:3]
+; GFX10-NEXT:    s_sub_i32 s0, 64, s8
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s0, v[2:3]
+; GFX10-NEXT:    s_sub_i32 s0, s8, 64
+; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX10-NEXT:    v_lshrrev_b64 v[11:12], s0, v[2:3]
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-NEXT:    s_and_b32 s1, 1, vcc_lo
+; GFX10-NEXT:    s_and_b32 s0, 1, s0
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], s8, v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v11, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v12, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
+; GFX10-NEXT:    v_or_b32_e32 v0, s4, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, s5, v1
+; GFX10-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX10-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
+  %cast.result = bitcast i128 %result to <4 x float>
+  ret <4 x float> %cast.result
+}
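+
+; NOTE (reviewer comment, not a generated check): for a variable amount, the
+; fshr lowering exercised by these tests amounts to two clamped shifts OR'd
+; together. A minimal IR sketch of that recipe, assuming the power-of-two
+; width lets "urem 128" fold to "and 127":
+;
+;   define i128 @fshr_i128_sketch(i128 %lhs, i128 %rhs, i128 %amt) {
+;     %namt = and i128 %amt, 127    ; amount for the right shift of %rhs
+;     %not  = xor i128 %amt, -1
+;     %inv  = and i128 %not, 127    ; inverted amount for the left shift
+;     %pre  = shl i128 %lhs, 1      ; pre-shift keeps %inv in [0, 127]
+;     %hi   = shl i128 %pre, %inv
+;     %lo   = lshr i128 %rhs, %namt
+;     %res  = or i128 %hi, %lo
+;     ret i128 %res
+;   }
+;
+; The s_and_b64/s_andn2_b64 pair at the top of each check block above computes
+; %namt and %inv; the pre-shift by 1 shows up as the lshl-by-1 / shift-by-63
+; (64 minus 1) pairs inside the expanded 128-bit shift sequences.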
+
+define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) {
+; GFX6-LABEL: v_fshr_i128_vss:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_movk_i32 s6, 0x7f
+; GFX6-NEXT:    s_mov_b32 s7, 0
+; GFX6-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX6-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX6-NEXT:    s_sub_i32 s6, 64, 1
+; GFX6-NEXT:    s_sub_i32 s5, 1, 64
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], s6
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], 1
+; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    v_lshl_b64 v[8:9], v[0:1], 1
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], s5
+; GFX6-NEXT:    s_and_b32 s5, 1, s7
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX6-NEXT:    v_or_b32_e32 v6, v4, v6
+; GFX6-NEXT:    v_or_b32_e32 v7, v5, v7
+; GFX6-NEXT:    s_and_b32 s5, 1, s9
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX6-NEXT:    s_sub_i32 s5, s4, 64
+; GFX6-NEXT:    s_sub_i32 s6, 64, s4
+; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    v_lshr_b64 v[2:3], v[4:5], s6
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[0:1], s4
+; GFX6-NEXT:    v_lshl_b64 v[8:9], v[4:5], s4
+; GFX6-NEXT:    s_and_b32 s4, 1, s7
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    s_and_b32 s4, 1, s9
+; GFX6-NEXT:    s_sub_i32 s10, s8, 64
+; GFX6-NEXT:    s_sub_i32 s9, 64, s8
+; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX6-NEXT:    v_or_b32_e32 v6, v2, v6
+; GFX6-NEXT:    v_or_b32_e32 v7, v3, v7
+; GFX6-NEXT:    v_lshl_b64 v[2:3], v[4:5], s5
+; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[2:3], s8
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[2:3], s9
+; GFX6-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
+; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
+; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
+; GFX6-NEXT:    v_or_b32_e32 v0, s0, v4
+; GFX6-NEXT:    v_or_b32_e32 v1, s1, v5
+; GFX6-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i128_vss:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_movk_i32 s6, 0x7f
+; GFX8-NEXT:    s_mov_b32 s7, 0
+; GFX8-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX8-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX8-NEXT:    s_sub_i32 s6, 64, 1
+; GFX8-NEXT:    s_sub_i32 s5, 1, 64
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], 1, v[2:3]
+; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX8-NEXT:    v_lshlrev_b64 v[8:9], 1, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
+; GFX8-NEXT:    s_and_b32 s5, 1, s7
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX8-NEXT:    v_or_b32_e32 v6, v4, v6
+; GFX8-NEXT:    v_or_b32_e32 v7, v5, v7
+; GFX8-NEXT:    s_and_b32 s5, 1, s9
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX8-NEXT:    s_sub_i32 s5, s4, 64
+; GFX8-NEXT:    s_sub_i32 s6, 64, s4
+; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], s6, v[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s4, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[8:9], s4, v[4:5]
+; GFX8-NEXT:    s_and_b32 s4, 1, s7
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    s_and_b32 s4, 1, s9
+; GFX8-NEXT:    s_sub_i32 s10, s8, 64
+; GFX8-NEXT:    s_sub_i32 s9, 64, s8
+; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX8-NEXT:    v_or_b32_e32 v6, v2, v6
+; GFX8-NEXT:    v_or_b32_e32 v7, v3, v7
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], s5, v[4:5]
+; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    s_lshr_b64 s[4:5], s[2:3], s8
+; GFX8-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
+; GFX8-NEXT:    s_lshl_b64 s[8:9], s[2:3], s9
+; GFX8-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
+; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
+; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v4
+; GFX8-NEXT:    v_or_b32_e32 v1, s1, v5
+; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i128_vss:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_movk_i32 s6, 0x7f
+; GFX9-NEXT:    s_mov_b32 s7, 0
+; GFX9-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX9-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX9-NEXT:    s_sub_i32 s6, 64, 1
+; GFX9-NEXT:    s_sub_i32 s5, 1, 64
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], 1, v[2:3]
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    v_lshlrev_b64 v[8:9], 1, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
+; GFX9-NEXT:    s_and_b32 s5, 1, s7
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX9-NEXT:    v_or_b32_e32 v6, v4, v6
+; GFX9-NEXT:    v_or_b32_e32 v7, v5, v7
+; GFX9-NEXT:    s_and_b32 s5, 1, s9
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX9-NEXT:    s_sub_i32 s5, s4, 64
+; GFX9-NEXT:    s_sub_i32 s6, 64, s4
+; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], s6, v[4:5]
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s4, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[8:9], s4, v[4:5]
+; GFX9-NEXT:    s_and_b32 s4, 1, s7
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    s_and_b32 s4, 1, s9
+; GFX9-NEXT:    s_sub_i32 s10, s8, 64
+; GFX9-NEXT:    s_sub_i32 s9, 64, s8
+; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX9-NEXT:    v_or_b32_e32 v6, v2, v6
+; GFX9-NEXT:    v_or_b32_e32 v7, v3, v7
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], s5, v[4:5]
+; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[2:3], s8
+; GFX9-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s9
+; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
+; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
+; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v4
+; GFX9-NEXT:    v_or_b32_e32 v1, s1, v5
+; GFX9-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i128_vss:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_movk_i32 s6, 0x7f
+; GFX10-NEXT:    s_mov_b32 s7, 0
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], 1, v[2:3]
+; GFX10-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX10-NEXT:    s_andn2_b64 s[6:7], s[6:7], s[4:5]
+; GFX10-NEXT:    s_sub_i32 s4, 64, 1
+; GFX10-NEXT:    s_sub_i32 s5, 1, 64
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    v_lshlrev_b64 v[13:14], s5, v[0:1]
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    v_lshlrev_b64 v[8:9], 1, v[0:1]
+; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX10-NEXT:    s_and_b32 s4, 1, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v6, v4, v6
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s4
+; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX10-NEXT:    s_and_b32 s5, 1, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v13, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v14, v5, s4
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc_lo
+; GFX10-NEXT:    s_sub_i32 s5, s6, 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s4
+; GFX10-NEXT:    s_sub_i32 s4, 64, s6
+; GFX10-NEXT:    s_cmp_lt_u32 s6, 64
+; GFX10-NEXT:    v_lshrrev_b64 v[11:12], s4, v[4:5]
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s6, v[0:1]
+; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s6, v[4:5]
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-NEXT:    s_and_b32 s4, 1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], s5, v[4:5]
+; GFX10-NEXT:    v_or_b32_e32 v2, v11, v6
+; GFX10-NEXT:    v_or_b32_e32 v3, v12, v7
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s4
+; GFX10-NEXT:    s_sub_i32 s10, s8, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s4
+; GFX10-NEXT:    s_and_b32 s4, 1, s6
+; GFX10-NEXT:    s_sub_i32 s6, 64, s8
+; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
+; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[0:1], s8
+; GFX10-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
+; GFX10-NEXT:    s_lshr_b64 s[8:9], s[2:3], s8
+; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v6
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[8:9], 0
+; GFX10-NEXT:    v_or_b32_e32 v1, s1, v7
+; GFX10-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX10-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
+  %cast.result = bitcast i128 %result to <4 x float>
+  ret <4 x float> %cast.result
+}
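+
+; NOTE (reviewer comment): most of the instruction count above comes from the
+; 128-bit shifts themselves being legalized into 64-bit halves with
+; compare+select, which is where the repeated s_cmp_lt_u32 <n>, 64 /
+; s_cselect (or v_cmp/v_cndmask) sequences originate. A sketch of the low
+; half of a 128-bit logical right shift over an i64 pair (assumed shape, for
+; illustration only):
+;
+;   define i64 @lshr128_lo(i64 %lo, i64 %hi, i64 %n) {
+;     %lt64   = icmp ult i64 %n, 64
+;     %inv    = sub i64 64, %n
+;     %loshr  = lshr i64 %lo, %n
+;     %hiin   = shl i64 %hi, %inv       ; poison when %n == 0, hence the
+;     %small  = or i64 %loshr, %hiin    ; extra %n == 0 select below
+;     %nm64   = sub i64 %n, 64
+;     %big    = lshr i64 %hi, %nm64
+;     %sel    = select i1 %lt64, i64 %small, i64 %big
+;     %iszero = icmp eq i64 %n, 0
+;     %res    = select i1 %iszero, i64 %lo, i64 %sel
+;     ret i64 %res
+;   }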
+
+define amdgpu_ps i128 @s_fshr_i128_65(i128 inreg %lhs, i128 inreg %rhs) {
+; GFX6-LABEL: s_fshr_i128_65:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_sub_i32 s14, 63, 64
+; GFX6-NEXT:    s_sub_i32 s10, 64, 63
+; GFX6-NEXT:    s_cmp_lt_u32 63, 64
+; GFX6-NEXT:    s_mov_b32 s8, 0
+; GFX6-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 63, 0
+; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[0:1], s10
+; GFX6-NEXT:    s_lshl_b32 s13, s2, 31
+; GFX6-NEXT:    s_mov_b32 s12, s8
+; GFX6-NEXT:    s_lshl_b32 s9, s0, 31
+; GFX6-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s14
+; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    s_movk_i32 s10, 0x41
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX6-NEXT:    s_sub_i32 s14, s10, 64
+; GFX6-NEXT:    s_sub_i32 s12, 64, s10
+; GFX6-NEXT:    s_cmp_lt_u32 s10, 64
+; GFX6-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s10, 0
+; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[6:7], s10
+; GFX6-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
+; GFX6-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
+; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX6-NEXT:    s_cselect_b64 s[6:7], s[10:11], s[6:7]
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX6-NEXT:    s_cselect_b64 s[6:7], s[0:1], 0
+; GFX6-NEXT:    s_or_b64 s[0:1], s[8:9], s[4:5]
+; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i128_65:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_sub_i32 s14, 63, 64
+; GFX8-NEXT:    s_sub_i32 s10, 64, 63
+; GFX8-NEXT:    s_cmp_lt_u32 63, 64
+; GFX8-NEXT:    s_mov_b32 s8, 0
+; GFX8-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 63, 0
+; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[0:1], s10
+; GFX8-NEXT:    s_lshl_b32 s13, s2, 31
+; GFX8-NEXT:    s_mov_b32 s12, s8
+; GFX8-NEXT:    s_lshl_b32 s9, s0, 31
+; GFX8-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s14
+; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX8-NEXT:    s_movk_i32 s10, 0x41
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX8-NEXT:    s_sub_i32 s14, s10, 64
+; GFX8-NEXT:    s_sub_i32 s12, 64, s10
+; GFX8-NEXT:    s_cmp_lt_u32 s10, 64
+; GFX8-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s10, 0
+; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[6:7], s10
+; GFX8-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
+; GFX8-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX8-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
+; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_cselect_b64 s[6:7], s[10:11], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_cselect_b64 s[6:7], s[0:1], 0
+; GFX8-NEXT:    s_or_b64 s[0:1], s[8:9], s[4:5]
+; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i128_65:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_sub_i32 s14, 63, 64
+; GFX9-NEXT:    s_sub_i32 s10, 64, 63
+; GFX9-NEXT:    s_cmp_lt_u32 63, 64
+; GFX9-NEXT:    s_mov_b32 s8, 0
+; GFX9-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 63, 0
+; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[0:1], s10
+; GFX9-NEXT:    s_lshl_b32 s13, s2, 31
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_lshl_b32 s9, s0, 31
+; GFX9-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s14
+; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX9-NEXT:    s_movk_i32 s10, 0x41
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    s_sub_i32 s14, s10, 64
+; GFX9-NEXT:    s_sub_i32 s12, 64, s10
+; GFX9-NEXT:    s_cmp_lt_u32 s10, 64
+; GFX9-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s10, 0
+; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[6:7], s10
+; GFX9-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
+; GFX9-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX9-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
+; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_cselect_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_cselect_b64 s[6:7], s[0:1], 0
+; GFX9-NEXT:    s_or_b64 s[0:1], s[8:9], s[4:5]
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i128_65:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_sub_i32 s14, 63, 64
+; GFX10-NEXT:    s_sub_i32 s9, 64, 63
+; GFX10-NEXT:    s_cmp_lt_u32 63, 64
+; GFX10-NEXT:    s_mov_b32 s8, 0
+; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 63, 0
+; GFX10-NEXT:    s_mov_b32 s12, s8
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[0:1], s9
+; GFX10-NEXT:    s_lshl_b32 s13, s2, 31
+; GFX10-NEXT:    s_lshl_b32 s9, s0, 31
+; GFX10-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s14
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_movk_i32 s12, 0x41
+; GFX10-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX10-NEXT:    s_sub_i32 s14, s12, 64
+; GFX10-NEXT:    s_sub_i32 s10, 64, s12
+; GFX10-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[0:1], s[4:5], s12
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[6:7], s10
+; GFX10-NEXT:    s_lshr_b64 s[12:13], s[6:7], s12
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[12:13], 0
+; GFX10-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 65)
+  ret i128 %result
+}
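+
+; NOTE (reviewer comment): with the constant amount the same recipe applies
+; with z = 65, so the inverted amount is ~65 & 127 = 62 and, together with
+; the pre-shift by 1, the whole test is equivalent to this sketch:
+;
+;   %hi  = shl i128 %lhs, 63
+;   %lo  = lshr i128 %rhs, 65
+;   %res = or i128 %hi, %lo
+;
+; The unfolded "s_cmp_lt_u32 63, 64" / "s_cmp_eq_u32 63, 0" comparisons in
+; the output suggest these constants are not yet folded at this point in the
+; GlobalISel pipeline.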
+
+define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
+; GFX6-LABEL: v_fshr_i128_65:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_sub_i32 s5, 64, 63
+; GFX6-NEXT:    s_sub_i32 s4, 63, 64
+; GFX6-NEXT:    s_cmp_lt_u32 63, 64
+; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 63, 0
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[0:1], s5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v10, 31, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v11, 31, v2
+; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], s4
+; GFX6-NEXT:    s_and_b32 s4, 1, s6
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX6-NEXT:    s_and_b32 s4, 1, s7
+; GFX6-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    s_movk_i32 s4, 0x41
+; GFX6-NEXT:    s_sub_i32 s5, s4, 64
+; GFX6-NEXT:    s_sub_i32 s6, 64, s4
+; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX6-NEXT:    v_cndmask_b32_e32 v11, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v12, v1, v3, vcc
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[4:5], s4
+; GFX6-NEXT:    v_lshl_b64 v[2:3], v[6:7], s6
+; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[6:7], s4
+; GFX6-NEXT:    s_and_b32 s4, 1, s7
+; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[6:7], s5
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    s_and_b32 s4, 1, s8
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    s_and_b32 s4, 1, s7
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX6-NEXT:    v_or_b32_e32 v1, v10, v1
+; GFX6-NEXT:    v_or_b32_e32 v2, v11, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v12, v3
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i128_65:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_sub_i32 s5, 64, 63
+; GFX8-NEXT:    s_sub_i32 s4, 63, 64
+; GFX8-NEXT:    s_cmp_lt_u32 63, 64
+; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 63, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s5, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 31, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 31, v2
+; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX8-NEXT:    s_and_b32 s4, 1, s6
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX8-NEXT:    s_and_b32 s4, 1, s7
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    s_movk_i32 s4, 0x41
+; GFX8-NEXT:    s_sub_i32 s5, s4, 64
+; GFX8-NEXT:    s_sub_i32 s6, 64, s4
+; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v1, v3, vcc
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], s4, v[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], s6, v[6:7]
+; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s4, v[6:7]
+; GFX8-NEXT:    s_and_b32 s4, 1, s7
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], s5, v[6:7]
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    s_and_b32 s4, 1, s8
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    s_and_b32 s4, 1, s7
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX8-NEXT:    v_or_b32_e32 v1, v10, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v11, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v12, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i128_65:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_sub_i32 s5, 64, 63
+; GFX9-NEXT:    s_sub_i32 s4, 63, 64
+; GFX9-NEXT:    s_cmp_lt_u32 63, 64
+; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 63, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s5, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 31, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 31, v2
+; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX9-NEXT:    s_and_b32 s4, 1, s6
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX9-NEXT:    s_and_b32 s4, 1, s7
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    s_movk_i32 s4, 0x41
+; GFX9-NEXT:    s_sub_i32 s5, s4, 64
+; GFX9-NEXT:    s_sub_i32 s6, 64, s4
+; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v1, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], s4, v[4:5]
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], s6, v[6:7]
+; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s4, v[6:7]
+; GFX9-NEXT:    s_and_b32 s4, 1, s7
+; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], s5, v[6:7]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    s_and_b32 s4, 1, s8
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    s_and_b32 s4, 1, s7
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX9-NEXT:    v_or_b32_e32 v1, v10, v1
+; GFX9-NEXT:    v_or_b32_e32 v2, v11, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, v12, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i128_65:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_sub_i32 s4, 64, 63
+; GFX10-NEXT:    s_sub_i32 s5, 63, 64
+; GFX10-NEXT:    s_cmp_lt_u32 63, 64
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 31, v0
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 63, 0
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s4, v[0:1]
+; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    v_lshlrev_b64 v[14:15], s5, v[0:1]
+; GFX10-NEXT:    s_and_b32 s5, 1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 31, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, 0, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
+; GFX10-NEXT:    s_movk_i32 s6, 0x41
+; GFX10-NEXT:    s_and_b32 s4, 1, s4
+; GFX10-NEXT:    s_sub_i32 s5, 64, s6
+; GFX10-NEXT:    v_or_b32_e32 v12, v9, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v14, v8, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b64 v[10:11], s5, v[6:7]
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s6, v[4:5]
+; GFX10-NEXT:    s_sub_i32 s5, s6, 64
+; GFX10-NEXT:    s_cmp_lt_u32 s6, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v15, v12, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b64 v[15:16], s5, v[6:7]
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX10-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX10-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    v_lshrrev_b64 v[6:7], s6, v[6:7]
+; GFX10-NEXT:    s_and_b32 s5, 1, s5
+; GFX10-NEXT:    s_and_b32 s6, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v15, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, s5
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v19, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v12, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v15, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v11, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0, v7, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v1, v23, v1
+; GFX10-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX10-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 65)
+  ret i128 %result
+}
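+
+; NOTE (reviewer comment): the <2 x i128> tests that follow reuse the scalar
+; expansion once per element; a sketch of the scalarization step for element
+; 0 (assumed shape, for illustration only):
+;
+;   %l0 = extractelement <2 x i128> %lhs, i64 0
+;   %r0 = extractelement <2 x i128> %rhs, i64 0
+;   %a0 = extractelement <2 x i128> %amt, i64 0
+;   %f0 = call i128 @llvm.fshr.i128(i128 %l0, i128 %r0, i128 %a0)
+;   ; ... likewise for element 1, then insertelement rebuilds the result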
+
+define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
+; GFX6-LABEL: s_fshr_v2i128:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_movk_i32 s18, 0x7f
+; GFX6-NEXT:    s_mov_b32 s19, 0
+; GFX6-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
+; GFX6-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
+; GFX6-NEXT:    s_sub_i32 s30, 1, 64
+; GFX6-NEXT:    s_sub_i32 s31, 64, 1
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[26:27], s[0:1], s31
+; GFX6-NEXT:    s_lshl_b64 s[28:29], s[2:3], 1
+; GFX6-NEXT:    s_lshl_b64 s[24:25], s[0:1], 1
+; GFX6-NEXT:    s_or_b64 s[26:27], s[26:27], s[28:29]
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s30
+; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX6-NEXT:    s_cselect_b64 s[24:25], s[24:25], 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[26:27], s[0:1]
+; GFX6-NEXT:    s_cmp_lg_u32 s23, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT:    s_sub_i32 s23, s16, 64
+; GFX6-NEXT:    s_sub_i32 s17, 64, s16
+; GFX6-NEXT:    s_cmp_lt_u32 s16, 64
+; GFX6-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s16, 0
+; GFX6-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[24:25], s16
+; GFX6-NEXT:    s_lshr_b64 s[26:27], s[24:25], s17
+; GFX6-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
+; GFX6-NEXT:    s_or_b64 s[16:17], s[26:27], s[16:17]
+; GFX6-NEXT:    s_lshl_b64 s[24:25], s[24:25], s23
+; GFX6-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
+; GFX6-NEXT:    s_cselect_b64 s[16:17], s[16:17], s[24:25]
+; GFX6-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX6-NEXT:    s_cselect_b64 s[16:17], s[0:1], s[16:17]
+; GFX6-NEXT:    s_sub_i32 s26, s22, 64
+; GFX6-NEXT:    s_sub_i32 s24, 64, s22
+; GFX6-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX6-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s22, 0
+; GFX6-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[10:11], s22
+; GFX6-NEXT:    s_lshl_b64 s[24:25], s[10:11], s24
+; GFX6-NEXT:    s_lshr_b64 s[22:23], s[8:9], s22
+; GFX6-NEXT:    s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[10:11], s26
+; GFX6-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX6-NEXT:    s_cselect_b64 s[10:11], s[22:23], s[10:11]
+; GFX6-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
+; GFX6-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX6-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
+; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX6-NEXT:    s_or_b64 s[2:3], s[16:17], s[10:11]
+; GFX6-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
+; GFX6-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[18:19], s[4:5], s31
+; GFX6-NEXT:    s_lshl_b64 s[20:21], s[6:7], 1
+; GFX6-NEXT:    s_lshl_b64 s[16:17], s[4:5], 1
+; GFX6-NEXT:    s_or_b64 s[18:19], s[18:19], s[20:21]
+; GFX6-NEXT:    s_lshl_b64 s[4:5], s[4:5], s30
+; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX6-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[18:19], s[4:5]
+; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX6-NEXT:    s_sub_i32 s9, s10, 64
+; GFX6-NEXT:    s_sub_i32 s11, 64, s10
+; GFX6-NEXT:    s_cmp_lt_u32 s10, 64
+; GFX6-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s10, 0
+; GFX6-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX6-NEXT:    s_lshl_b64 s[6:7], s[16:17], s10
+; GFX6-NEXT:    s_lshr_b64 s[18:19], s[16:17], s11
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
+; GFX6-NEXT:    s_or_b64 s[10:11], s[18:19], s[10:11]
+; GFX6-NEXT:    s_lshl_b64 s[16:17], s[16:17], s9
+; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
+; GFX6-NEXT:    s_cselect_b64 s[10:11], s[10:11], s[16:17]
+; GFX6-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX6-NEXT:    s_cselect_b64 s[10:11], s[4:5], s[10:11]
+; GFX6-NEXT:    s_sub_i32 s18, s8, 64
+; GFX6-NEXT:    s_sub_i32 s16, 64, s8
+; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX6-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX6-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[14:15], s8
+; GFX6-NEXT:    s_lshl_b64 s[16:17], s[14:15], s16
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[12:13], s8
+; GFX6-NEXT:    s_or_b64 s[8:9], s[8:9], s[16:17]
+; GFX6-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
+; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[14:15]
+; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX6-NEXT:    s_cselect_b64 s[8:9], s[12:13], s[8:9]
+; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_cselect_b64 s[12:13], s[4:5], 0
+; GFX6-NEXT:    s_or_b64 s[4:5], s[6:7], s[8:9]
+; GFX6-NEXT:    s_or_b64 s[6:7], s[10:11], s[12:13]
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_v2i128:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_movk_i32 s18, 0x7f
+; GFX8-NEXT:    s_mov_b32 s19, 0
+; GFX8-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
+; GFX8-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
+; GFX8-NEXT:    s_sub_i32 s30, 1, 64
+; GFX8-NEXT:    s_sub_i32 s31, 64, 1
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[26:27], s[0:1], s31
+; GFX8-NEXT:    s_lshl_b64 s[28:29], s[2:3], 1
+; GFX8-NEXT:    s_lshl_b64 s[24:25], s[0:1], 1
+; GFX8-NEXT:    s_or_b64 s[26:27], s[26:27], s[28:29]
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s30
+; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX8-NEXT:    s_cselect_b64 s[24:25], s[24:25], 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[26:27], s[0:1]
+; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT:    s_sub_i32 s23, s16, 64
+; GFX8-NEXT:    s_sub_i32 s17, 64, s16
+; GFX8-NEXT:    s_cmp_lt_u32 s16, 64
+; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s16, 0
+; GFX8-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[24:25], s16
+; GFX8-NEXT:    s_lshr_b64 s[26:27], s[24:25], s17
+; GFX8-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
+; GFX8-NEXT:    s_or_b64 s[16:17], s[26:27], s[16:17]
+; GFX8-NEXT:    s_lshl_b64 s[24:25], s[24:25], s23
+; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
+; GFX8-NEXT:    s_cselect_b64 s[16:17], s[16:17], s[24:25]
+; GFX8-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX8-NEXT:    s_cselect_b64 s[16:17], s[0:1], s[16:17]
+; GFX8-NEXT:    s_sub_i32 s26, s22, 64
+; GFX8-NEXT:    s_sub_i32 s24, 64, s22
+; GFX8-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX8-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s22, 0
+; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[10:11], s22
+; GFX8-NEXT:    s_lshl_b64 s[24:25], s[10:11], s24
+; GFX8-NEXT:    s_lshr_b64 s[22:23], s[8:9], s22
+; GFX8-NEXT:    s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[10:11], s26
+; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX8-NEXT:    s_cselect_b64 s[10:11], s[22:23], s[10:11]
+; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
+; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX8-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
+; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX8-NEXT:    s_or_b64 s[2:3], s[16:17], s[10:11]
+; GFX8-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
+; GFX8-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[18:19], s[4:5], s31
+; GFX8-NEXT:    s_lshl_b64 s[20:21], s[6:7], 1
+; GFX8-NEXT:    s_lshl_b64 s[16:17], s[4:5], 1
+; GFX8-NEXT:    s_or_b64 s[18:19], s[18:19], s[20:21]
+; GFX8-NEXT:    s_lshl_b64 s[4:5], s[4:5], s30
+; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX8-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[18:19], s[4:5]
+; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX8-NEXT:    s_sub_i32 s9, s10, 64
+; GFX8-NEXT:    s_sub_i32 s11, 64, s10
+; GFX8-NEXT:    s_cmp_lt_u32 s10, 64
+; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s10, 0
+; GFX8-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX8-NEXT:    s_lshl_b64 s[6:7], s[16:17], s10
+; GFX8-NEXT:    s_lshr_b64 s[18:19], s[16:17], s11
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
+; GFX8-NEXT:    s_or_b64 s[10:11], s[18:19], s[10:11]
+; GFX8-NEXT:    s_lshl_b64 s[16:17], s[16:17], s9
+; GFX8-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
+; GFX8-NEXT:    s_cselect_b64 s[10:11], s[10:11], s[16:17]
+; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX8-NEXT:    s_cselect_b64 s[10:11], s[4:5], s[10:11]
+; GFX8-NEXT:    s_sub_i32 s18, s8, 64
+; GFX8-NEXT:    s_sub_i32 s16, 64, s8
+; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[4:5], s[14:15], s8
+; GFX8-NEXT:    s_lshl_b64 s[16:17], s[14:15], s16
+; GFX8-NEXT:    s_lshr_b64 s[8:9], s[12:13], s8
+; GFX8-NEXT:    s_or_b64 s[8:9], s[8:9], s[16:17]
+; GFX8-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
+; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[14:15]
+; GFX8-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX8-NEXT:    s_cselect_b64 s[8:9], s[12:13], s[8:9]
+; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_cselect_b64 s[12:13], s[4:5], 0
+; GFX8-NEXT:    s_or_b64 s[4:5], s[6:7], s[8:9]
+; GFX8-NEXT:    s_or_b64 s[6:7], s[10:11], s[12:13]
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_v2i128:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_movk_i32 s18, 0x7f
+; GFX9-NEXT:    s_mov_b32 s19, 0
+; GFX9-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
+; GFX9-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
+; GFX9-NEXT:    s_sub_i32 s30, 1, 64
+; GFX9-NEXT:    s_sub_i32 s31, 64, 1
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[26:27], s[0:1], s31
+; GFX9-NEXT:    s_lshl_b64 s[28:29], s[2:3], 1
+; GFX9-NEXT:    s_lshl_b64 s[24:25], s[0:1], 1
+; GFX9-NEXT:    s_or_b64 s[26:27], s[26:27], s[28:29]
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s30
+; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX9-NEXT:    s_cselect_b64 s[24:25], s[24:25], 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[26:27], s[0:1]
+; GFX9-NEXT:    s_cmp_lg_u32 s23, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT:    s_sub_i32 s23, s16, 64
+; GFX9-NEXT:    s_sub_i32 s17, 64, s16
+; GFX9-NEXT:    s_cmp_lt_u32 s16, 64
+; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s16, 0
+; GFX9-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[24:25], s16
+; GFX9-NEXT:    s_lshr_b64 s[26:27], s[24:25], s17
+; GFX9-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
+; GFX9-NEXT:    s_or_b64 s[16:17], s[26:27], s[16:17]
+; GFX9-NEXT:    s_lshl_b64 s[24:25], s[24:25], s23
+; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
+; GFX9-NEXT:    s_cselect_b64 s[16:17], s[16:17], s[24:25]
+; GFX9-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX9-NEXT:    s_cselect_b64 s[16:17], s[0:1], s[16:17]
+; GFX9-NEXT:    s_sub_i32 s26, s22, 64
+; GFX9-NEXT:    s_sub_i32 s24, 64, s22
+; GFX9-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX9-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s22, 0
+; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[10:11], s22
+; GFX9-NEXT:    s_lshl_b64 s[24:25], s[10:11], s24
+; GFX9-NEXT:    s_lshr_b64 s[22:23], s[8:9], s22
+; GFX9-NEXT:    s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[10:11], s26
+; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX9-NEXT:    s_cselect_b64 s[10:11], s[22:23], s[10:11]
+; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
+; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX9-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
+; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX9-NEXT:    s_or_b64 s[2:3], s[16:17], s[10:11]
+; GFX9-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
+; GFX9-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[18:19], s[4:5], s31
+; GFX9-NEXT:    s_lshl_b64 s[20:21], s[6:7], 1
+; GFX9-NEXT:    s_lshl_b64 s[16:17], s[4:5], 1
+; GFX9-NEXT:    s_or_b64 s[18:19], s[18:19], s[20:21]
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], s30
+; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX9-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[18:19], s[4:5]
+; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX9-NEXT:    s_sub_i32 s9, s10, 64
+; GFX9-NEXT:    s_sub_i32 s11, 64, s10
+; GFX9-NEXT:    s_cmp_lt_u32 s10, 64
+; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s10, 0
+; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX9-NEXT:    s_lshl_b64 s[6:7], s[16:17], s10
+; GFX9-NEXT:    s_lshr_b64 s[18:19], s[16:17], s11
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
+; GFX9-NEXT:    s_or_b64 s[10:11], s[18:19], s[10:11]
+; GFX9-NEXT:    s_lshl_b64 s[16:17], s[16:17], s9
+; GFX9-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
+; GFX9-NEXT:    s_cselect_b64 s[10:11], s[10:11], s[16:17]
+; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX9-NEXT:    s_cselect_b64 s[10:11], s[4:5], s[10:11]
+; GFX9-NEXT:    s_sub_i32 s18, s8, 64
+; GFX9-NEXT:    s_sub_i32 s16, 64, s8
+; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[14:15], s8
+; GFX9-NEXT:    s_lshl_b64 s[16:17], s[14:15], s16
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[12:13], s8
+; GFX9-NEXT:    s_or_b64 s[8:9], s[8:9], s[16:17]
+; GFX9-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
+; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[14:15]
+; GFX9-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX9-NEXT:    s_cselect_b64 s[8:9], s[12:13], s[8:9]
+; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_cselect_b64 s[12:13], s[4:5], 0
+; GFX9-NEXT:    s_or_b64 s[4:5], s[6:7], s[8:9]
+; GFX9-NEXT:    s_or_b64 s[6:7], s[10:11], s[12:13]
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_v2i128:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_movk_i32 s18, 0x7f
+; GFX10-NEXT:    s_mov_b32 s19, 0
+; GFX10-NEXT:    s_sub_i32 s30, 1, 64
+; GFX10-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
+; GFX10-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
+; GFX10-NEXT:    s_sub_i32 s31, 64, 1
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    s_mov_b32 s62, s10
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    s_mov_b32 s63, s11
+; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[24:25], s[0:1], s31
+; GFX10-NEXT:    s_lshl_b64 s[26:27], s[2:3], 1
+; GFX10-NEXT:    s_lshl_b64 s[28:29], s[0:1], 1
+; GFX10-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s30
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    s_cselect_b64 s[26:27], s[28:29], 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s23, 0
+; GFX10-NEXT:    s_cselect_b64 s[46:47], s[2:3], s[0:1]
+; GFX10-NEXT:    s_sub_i32 s23, s16, 64
+; GFX10-NEXT:    s_sub_i32 s2, 64, s16
+; GFX10-NEXT:    s_cmp_lt_u32 s16, 64
+; GFX10-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX10-NEXT:    s_lshl_b64 s[24:25], s[46:47], s16
+; GFX10-NEXT:    s_lshr_b64 s[2:3], s[26:27], s2
+; GFX10-NEXT:    s_lshl_b64 s[16:17], s[26:27], s16
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[24:25]
+; GFX10-NEXT:    s_lshl_b64 s[24:25], s[26:27], s23
+; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX10-NEXT:    s_cselect_b64 s[78:79], s[16:17], 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[24:25]
+; GFX10-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[46:47], s[2:3]
+; GFX10-NEXT:    s_sub_i32 s26, s22, 64
+; GFX10-NEXT:    s_sub_i32 s23, 64, s22
+; GFX10-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX10-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s22, 0
+; GFX10-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[0:1], s[8:9], s22
+; GFX10-NEXT:    s_lshl_b64 s[24:25], s[62:63], s23
+; GFX10-NEXT:    s_lshr_b64 s[22:23], s[62:63], s22
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[24:25]
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[62:63], s26
+; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
+; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX10-NEXT:    s_cselect_b64 s[8:9], s[22:23], 0
+; GFX10-NEXT:    s_or_b64 s[0:1], s[78:79], s[0:1]
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX10-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
+; GFX10-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[16:17], s[4:5], s31
+; GFX10-NEXT:    s_lshl_b64 s[18:19], s[6:7], 1
+; GFX10-NEXT:    s_lshl_b64 s[20:21], s[4:5], 1
+; GFX10-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX10-NEXT:    s_lshl_b64 s[4:5], s[4:5], s30
+; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    s_cselect_b64 s[18:19], s[20:21], 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[16:17], s[4:5]
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-NEXT:    s_sub_i32 s9, s10, 64
+; GFX10-NEXT:    s_sub_i32 s6, 64, s10
+; GFX10-NEXT:    s_cmp_lt_u32 s10, 64
+; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s10, 0
+; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX10-NEXT:    s_lshl_b64 s[16:17], s[4:5], s10
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[18:19], s6
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[18:19], s10
+; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[16:17]
+; GFX10-NEXT:    s_lshl_b64 s[16:17], s[18:19], s9
+; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX10-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
+; GFX10-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[16:17]
+; GFX10-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX10-NEXT:    s_cselect_b64 s[6:7], s[4:5], s[6:7]
+; GFX10-NEXT:    s_sub_i32 s18, s8, 64
+; GFX10-NEXT:    s_sub_i32 s9, 64, s8
+; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[12:13], s8
+; GFX10-NEXT:    s_lshl_b64 s[16:17], s[14:15], s9
+; GFX10-NEXT:    s_lshr_b64 s[8:9], s[14:15], s8
+; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[16:17]
+; GFX10-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
+; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[14:15]
+; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
+; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX10-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX10-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
+; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
+  ret <2 x i128> %result
+}
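+
+; NOTE (reviewer comment): these check blocks are machine-generated; if the
+; lowering changes, the expected workflow is to regenerate them with
+; llvm/utils/update_llc_test_checks.py rather than editing by hand, driven by
+; RUN lines of roughly this shape (the exact triple and CPU here are
+; assumptions; the real RUN lines sit at the top of the test file):
+;
+;   ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 \
+;   ; RUN:   -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s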
+
+define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) {
+; GFX6-LABEL: v_fshr_v2i128:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_sub_i32 s6, 64, 1
+; GFX6-NEXT:    s_sub_i32 s7, 1, 64
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    v_lshr_b64 v[17:18], v[0:1], s6
+; GFX6-NEXT:    v_lshl_b64 v[21:22], v[2:3], 1
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    v_or_b32_e32 v19, v17, v21
+; GFX6-NEXT:    v_or_b32_e32 v21, v18, v22
+; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX6-NEXT:    v_lshl_b64 v[17:18], v[0:1], 1
+; GFX6-NEXT:    s_and_b32 s4, 1, s4
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], s7
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    s_and_b32 s4, 1, s5
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v19, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; GFX6-NEXT:    s_movk_i32 s8, 0x7f
+; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v16
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v21, vcc
+; GFX6-NEXT:    v_and_b32_e32 v19, s8, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v18, 0, v18, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v19
+; GFX6-NEXT:    v_lshr_b64 v[2:3], v[17:18], v2
+; GFX6-NEXT:    v_lshl_b64 v[21:22], v[0:1], v19
+; GFX6-NEXT:    v_and_b32_e32 v25, s8, v16
+; GFX6-NEXT:    v_or_b32_e32 v23, v2, v21
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v25
+; GFX6-NEXT:    v_or_b32_e32 v24, v3, v22
+; GFX6-NEXT:    v_lshl_b64 v[2:3], v[10:11], v2
+; GFX6-NEXT:    v_lshr_b64 v[21:22], v[8:9], v25
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
+; GFX6-NEXT:    v_or_b32_e32 v21, v21, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v19
+; GFX6-NEXT:    v_or_b32_e32 v22, v22, v3
+; GFX6-NEXT:    v_lshl_b64 v[2:3], v[17:18], v2
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v23, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v24, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s[4:5]
+; GFX6-NEXT:    v_subrev_i32_e64 v0, s[4:5], 64, v25
+; GFX6-NEXT:    v_lshl_b64 v[16:17], v[17:18], v19
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[10:11], v0
+; GFX6-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v25
+; GFX6-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v18, v0, v21, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v19, v1, v22, s[4:5]
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[10:11], v25
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v25
+; GFX6-NEXT:    v_cndmask_b32_e32 v8, v18, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[4:5]
+; GFX6-NEXT:    v_or_b32_e32 v0, v16, v8
+; GFX6-NEXT:    v_cndmask_b32_e64 v11, 0, v1, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v9, v19, v9, vcc
+; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v20
+; GFX6-NEXT:    v_or_b32_e32 v1, v17, v9
+; GFX6-NEXT:    v_and_b32_e32 v17, s8, v8
+; GFX6-NEXT:    s_cmp_lt_u32 1, 64
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v10
+; GFX6-NEXT:    v_or_b32_e32 v3, v3, v11
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[4:5], s6
+; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], 1
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 1, 0
+; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, 1, s4
+; GFX6-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX6-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX6-NEXT:    v_lshl_b64 v[8:9], v[4:5], 1
+; GFX6-NEXT:    v_lshl_b64 v[4:5], v[4:5], s7
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    s_and_b32 s4, 1, s5
+; GFX6-NEXT:    v_cndmask_b32_e32 v8, 0, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 64, v17
+; GFX6-NEXT:    v_lshr_b64 v[6:7], v[8:9], v6
+; GFX6-NEXT:    v_lshl_b64 v[10:11], v[4:5], v17
+; GFX6-NEXT:    v_subrev_i32_e32 v18, vcc, 64, v17
+; GFX6-NEXT:    v_or_b32_e32 v10, v6, v10
+; GFX6-NEXT:    v_or_b32_e32 v11, v7, v11
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[8:9], v17
+; GFX6-NEXT:    v_lshl_b64 v[8:9], v[8:9], v18
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
+; GFX6-NEXT:    v_and_b32_e32 v16, s8, v20
+; GFX6-NEXT:    v_cndmask_b32_e32 v18, 0, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v19, 0, v7, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, v9, v11, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
+; GFX6-NEXT:    v_cndmask_b32_e32 v8, v6, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v9, v7, v5, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 64, v16
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[12:13], v16
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[14:15], v6
+; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, 64, v16
+; GFX6-NEXT:    v_or_b32_e32 v11, v4, v6
+; GFX6-NEXT:    v_or_b32_e32 v17, v5, v7
+; GFX6-NEXT:    v_lshr_b64 v[6:7], v[14:15], v10
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[14:15], v16
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v11, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v17, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v10, 0, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v11, 0, v5, vcc
+; GFX6-NEXT:    v_or_b32_e32 v4, v18, v6
+; GFX6-NEXT:    v_or_b32_e32 v5, v19, v7
+; GFX6-NEXT:    v_or_b32_e32 v6, v8, v10
+; GFX6-NEXT:    v_or_b32_e32 v7, v9, v11
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_v2i128:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_sub_i32 s6, 64, 1
+; GFX8-NEXT:    s_sub_i32 s7, 1, 64
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    v_lshrrev_b64 v[17:18], s6, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[21:22], 1, v[2:3]
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    v_or_b32_e32 v19, v17, v21
+; GFX8-NEXT:    v_or_b32_e32 v21, v18, v22
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    v_lshlrev_b64 v[17:18], 1, v[0:1]
+; GFX8-NEXT:    s_and_b32 s4, 1, s4
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s7, v[0:1]
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    s_and_b32 s4, 1, s5
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v19, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; GFX8-NEXT:    s_movk_i32 s8, 0x7f
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v21, vcc
+; GFX8-NEXT:    v_and_b32_e32 v19, s8, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, 0, v18, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v19
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v2, v[17:18]
+; GFX8-NEXT:    v_lshlrev_b64 v[21:22], v19, v[0:1]
+; GFX8-NEXT:    v_and_b32_e32 v25, s8, v16
+; GFX8-NEXT:    v_or_b32_e32 v23, v2, v21
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v25
+; GFX8-NEXT:    v_or_b32_e32 v24, v3, v22
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, v[10:11]
+; GFX8-NEXT:    v_lshrrev_b64 v[21:22], v25, v[8:9]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
+; GFX8-NEXT:    v_or_b32_e32 v21, v21, v2
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 64, v19
+; GFX8-NEXT:    v_or_b32_e32 v22, v22, v3
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, v[17:18]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v23, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v24, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s[4:5]
+; GFX8-NEXT:    v_subrev_u32_e64 v0, s[4:5], 64, v25
+; GFX8-NEXT:    v_lshlrev_b64 v[16:17], v19, v[17:18]
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, v[10:11]
+; GFX8-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v25
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, v0, v21, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v1, v22, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v25, v[10:11]
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v25
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v18, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[4:5]
+; GFX8-NEXT:    v_or_b32_e32 v0, v16, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, v1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v19, v9, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v8, -1, v20
+; GFX8-NEXT:    v_or_b32_e32 v1, v17, v9
+; GFX8-NEXT:    v_and_b32_e32 v17, s8, v8
+; GFX8-NEXT:    s_cmp_lt_u32 1, 64
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v10
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v11
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s6, v[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[10:11], 1, v[6:7]
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 1, 0
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, 1, s4
+; GFX8-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX8-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX8-NEXT:    v_lshlrev_b64 v[8:9], 1, v[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], s7, v[4:5]
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    s_and_b32 s4, 1, s5
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, 0, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 64, v17
+; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v6, v[8:9]
+; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v17, v[4:5]
+; GFX8-NEXT:    v_subrev_u32_e32 v18, vcc, 64, v17
+; GFX8-NEXT:    v_or_b32_e32 v10, v6, v10
+; GFX8-NEXT:    v_or_b32_e32 v11, v7, v11
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], v17, v[8:9]
+; GFX8-NEXT:    v_lshlrev_b64 v[8:9], v18, v[8:9]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
+; GFX8-NEXT:    v_and_b32_e32 v16, s8, v20
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, 0, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, 0, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v11, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v6, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v7, v5, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 64, v16
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v16, v[12:13]
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], v6, v[14:15]
+; GFX8-NEXT:    v_subrev_u32_e32 v10, vcc, 64, v16
+; GFX8-NEXT:    v_or_b32_e32 v11, v4, v6
+; GFX8-NEXT:    v_or_b32_e32 v17, v5, v7
+; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v10, v[14:15]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v16, v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v11, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v17, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, 0, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, 0, v5, vcc
+; GFX8-NEXT:    v_or_b32_e32 v4, v18, v6
+; GFX8-NEXT:    v_or_b32_e32 v5, v19, v7
+; GFX8-NEXT:    v_or_b32_e32 v6, v8, v10
+; GFX8-NEXT:    v_or_b32_e32 v7, v9, v11
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_v2i128:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_sub_i32 s6, 64, 1
+; GFX9-NEXT:    s_sub_i32 s7, 1, 64
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    v_lshrrev_b64 v[17:18], s6, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[21:22], 1, v[2:3]
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    v_or_b32_e32 v19, v17, v21
+; GFX9-NEXT:    v_or_b32_e32 v21, v18, v22
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    v_lshlrev_b64 v[17:18], 1, v[0:1]
+; GFX9-NEXT:    s_and_b32 s4, 1, s4
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s7, v[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    s_and_b32 s4, 1, s5
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v19, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; GFX9-NEXT:    s_movk_i32 s8, 0x7f
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v21, vcc
+; GFX9-NEXT:    v_and_b32_e32 v19, s8, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, 0, v18, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v19
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v2, v[17:18]
+; GFX9-NEXT:    v_lshlrev_b64 v[21:22], v19, v[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v25, s8, v16
+; GFX9-NEXT:    v_or_b32_e32 v23, v2, v21
+; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v25
+; GFX9-NEXT:    v_or_b32_e32 v24, v3, v22
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, v[10:11]
+; GFX9-NEXT:    v_lshrrev_b64 v[21:22], v25, v[8:9]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX9-NEXT:    v_or_b32_e32 v21, v21, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v2, 64, v19
+; GFX9-NEXT:    v_or_b32_e32 v22, v22, v3
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, v[17:18]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v23, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v24, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[4:5]
+; GFX9-NEXT:    v_subrev_u32_e32 v0, 64, v25
+; GFX9-NEXT:    v_lshlrev_b64 v[16:17], v19, v[17:18]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v1, s[4:5]
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, v[10:11]
+; GFX9-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v25
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, v0, v21, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v1, v22, s[4:5]
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v25, v[10:11]
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v25
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v18, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[4:5]
+; GFX9-NEXT:    v_or_b32_e32 v0, v16, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v19, v9, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v20
+; GFX9-NEXT:    v_or_b32_e32 v1, v17, v9
+; GFX9-NEXT:    v_and_b32_e32 v17, s8, v8
+; GFX9-NEXT:    s_cmp_lt_u32 1, 64
+; GFX9-NEXT:    v_or_b32_e32 v2, v2, v10
+; GFX9-NEXT:    v_or_b32_e32 v3, v3, v11
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s6, v[4:5]
+; GFX9-NEXT:    v_lshlrev_b64 v[10:11], 1, v[6:7]
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 1, 0
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, 1, s4
+; GFX9-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX9-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX9-NEXT:    v_lshlrev_b64 v[8:9], 1, v[4:5]
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], s7, v[4:5]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    s_and_b32 s4, 1, s5
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, 0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v6, 64, v17
+; GFX9-NEXT:    v_lshrrev_b64 v[6:7], v6, v[8:9]
+; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v17, v[4:5]
+; GFX9-NEXT:    v_subrev_u32_e32 v18, 64, v17
+; GFX9-NEXT:    v_or_b32_e32 v10, v6, v10
+; GFX9-NEXT:    v_or_b32_e32 v11, v7, v11
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], v17, v[8:9]
+; GFX9-NEXT:    v_lshlrev_b64 v[8:9], v18, v[8:9]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
+; GFX9-NEXT:    v_and_b32_e32 v16, s8, v20
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, 0, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v19, 0, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v11, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v6, v4, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v6, 64, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v7, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v16, v[12:13]
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], v6, v[14:15]
+; GFX9-NEXT:    v_subrev_u32_e32 v10, 64, v16
+; GFX9-NEXT:    v_or_b32_e32 v11, v4, v6
+; GFX9-NEXT:    v_or_b32_e32 v17, v5, v7
+; GFX9-NEXT:    v_lshrrev_b64 v[6:7], v10, v[14:15]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v16, v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v11, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v17, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, 0, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, 0, v5, vcc
+; GFX9-NEXT:    v_or_b32_e32 v4, v18, v6
+; GFX9-NEXT:    v_or_b32_e32 v5, v19, v7
+; GFX9-NEXT:    v_or_b32_e32 v6, v8, v10
+; GFX9-NEXT:    v_or_b32_e32 v7, v9, v11
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_v2i128:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_sub_i32 s5, 64, 1
+; GFX10-NEXT:    s_sub_i32 s6, 1, 64
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    v_lshrrev_b64 v[27:28], s5, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[21:22], 1, v[2:3]
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    v_lshlrev_b64 v[23:24], 1, v[0:1]
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10-NEXT:    s_and_b32 s4, 1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s6, v[0:1]
+; GFX10-NEXT:    v_xor_b32_e32 v19, -1, v16
+; GFX10-NEXT:    v_or_b32_e32 v21, v27, v21
+; GFX10-NEXT:    v_or_b32_e32 v18, v28, v22
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s4
+; GFX10-NEXT:    s_movk_i32 s7, 0x7f
+; GFX10-NEXT:    s_and_b32 s8, 1, s8
+; GFX10-NEXT:    v_and_b32_e32 v31, s7, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, 0, v23, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v18, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v21, s4
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s8
+; GFX10-NEXT:    v_cndmask_b32_e32 v18, 0, v24, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v19, 64, v31
+; GFX10-NEXT:    v_and_b32_e32 v26, s7, v16
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v16, 64, v31
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s4
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v19, v[17:18]
+; GFX10-NEXT:    v_mov_b32_e32 v35, v10
+; GFX10-NEXT:    v_mov_b32_e32 v36, v11
+; GFX10-NEXT:    v_sub_nc_u32_e32 v25, 64, v26
+; GFX10-NEXT:    v_lshlrev_b64 v[21:22], v31, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[23:24], v31, v[17:18]
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v31
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v29, 64, v26
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v26
+; GFX10-NEXT:    v_lshrrev_b64 v[27:28], s5, v[4:5]
+; GFX10-NEXT:    v_or_b32_e32 v21, v2, v21
+; GFX10-NEXT:    v_or_b32_e32 v22, v3, v22
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v16, v[17:18]
+; GFX10-NEXT:    v_lshlrev_b64 v[18:19], v25, v[35:36]
+; GFX10-NEXT:    v_lshrrev_b64 v[16:17], v26, v[8:9]
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, 0, v23, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v39, 0, v24, vcc_lo
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v2, v21, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v3, v22, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v29, v[35:36]
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v31
+; GFX10-NEXT:    v_or_b32_e32 v16, v16, v18
+; GFX10-NEXT:    v_or_b32_e32 v17, v17, v19
+; GFX10-NEXT:    v_lshlrev_b64 v[10:11], 1, v[6:7]
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v18, v21, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v31, v22, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v16, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v26
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v3, v17, s4
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10-NEXT:    v_lshlrev_b64 v[16:17], 1, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], s6, v[4:5]
+; GFX10-NEXT:    s_and_b32 s6, 1, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v2, v8, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v25, -1, v20
+; GFX10-NEXT:    v_or_b32_e32 v2, v27, v10
+; GFX10-NEXT:    v_or_b32_e32 v3, v28, v11
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s6, 0, s6
+; GFX10-NEXT:    s_and_b32 s8, 1, s8
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v26, v[35:36]
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, v16, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v19, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v4, v2, s6
+; GFX10-NEXT:    v_and_b32_e32 v30, s7, v25
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, v3, s6
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, v17, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0, v0, s4
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v30
+; GFX10-NEXT:    v_or_b32_e32 v0, v23, v21
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v11, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v23, s7, v20
+; GFX10-NEXT:    v_lshrrev_b64 v[5:6], v2, v[8:9]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, 64, v30
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, v1, s4
+; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v30, v[3:4]
+; GFX10-NEXT:    v_sub_nc_u32_e32 v20, 64, v23
+; GFX10-NEXT:    v_or_b32_e32 v1, v39, v16
+; GFX10-NEXT:    v_or_b32_e32 v2, v18, v19
+; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v30, v[8:9]
+; GFX10-NEXT:    v_lshrrev_b64 v[18:19], v23, v[12:13]
+; GFX10-NEXT:    v_or_b32_e32 v10, v5, v10
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, 64, v23
+; GFX10-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v30
+; GFX10-NEXT:    v_lshlrev_b64 v[7:8], v7, v[8:9]
+; GFX10-NEXT:    v_or_b32_e32 v9, v6, v11
+; GFX10-NEXT:    v_lshrrev_b64 v[34:35], v5, v[14:15]
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v23
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v16, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v16, v18, v20
+; GFX10-NEXT:    v_or_b32_e32 v18, v19, v21
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v7, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b64 v[7:8], v23, v[14:15]
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v30
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v34, v16, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v23
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v35, v18, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0, v17, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v10, v3, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v4, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, v12, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, v13, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, v7, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, v8, s4
+; GFX10-NEXT:    v_or_b32_e32 v3, v31, v26
+; GFX10-NEXT:    v_or_b32_e32 v4, v11, v4
+; GFX10-NEXT:    v_or_b32_e32 v5, v14, v5
+; GFX10-NEXT:    v_or_b32_e32 v6, v15, v6
+; GFX10-NEXT:    v_or_b32_e32 v7, v9, v7
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
+  ret <2 x i128> %result
+}
+
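The v2i128 expansions above are dominated by one recurring pattern: each 128-bit shift is assembled from 64-bit halves, computing both the under-64 and the 64-or-more results and selecting between them with a compare against 64 (v_cmp_gt_u32 plus v_cndmask, since the generated code selects rather than branches). A minimal sketch of that pattern for a single 128-bit logical shift right, written as a hypothetical stand-alone helper rather than code from this patch:

#include <cstdint>

// Sketch of the two-word shift pattern the expansion above is built from:
// a 128-bit logical shift right composed of 64-bit operations.
// Hypothetical helper for illustration only; not code from this patch.
struct U128 { uint64_t lo, hi; };

static U128 lshr128(U128 v, unsigned amt) {
  amt &= 127;                        // the amount is masked to the bit width
  if (amt == 0)
    return v;                        // avoids the out-of-range 64 - 0 shift
  if (amt < 64)                      // short shift: funnel hi bits into lo
    return {(v.lo >> amt) | (v.hi << (64 - amt)), v.hi >> amt};
  return {v.hi >> (amt - 64), 0};    // long shift: hi supplies everything
}

The masking of the amount with 127 mirrors the s_movk_i32 s8, 0x7f / v_and_b32 sequences in the checks above; the two if branches correspond to the pairs of shift results the generated code computes unconditionally and then selects between.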
+declare i7 @llvm.fshr.i7(i7, i7, i7) #0
+declare i8 @llvm.fshr.i8(i8, i8, i8) #0
+declare <2 x i8> @llvm.fshr.v2i8(<2 x i8>, <2 x i8>, <2 x i8>) #0
+declare <4 x i8> @llvm.fshr.v4i8(<4 x i8>, <4 x i8>, <4 x i8>) #0
+
+declare i16 @llvm.fshr.i16(i16, i16, i16) #0
+declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>) #0
+declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>) #0
+declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) #0
+declare <5 x i16> @llvm.fshr.v5i16(<5 x i16>, <5 x i16>, <5 x i16>) #0
+declare <6 x i16> @llvm.fshr.v6i16(<6 x i16>, <6 x i16>, <6 x i16>) #0
+declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) #0
+
+declare i24 @llvm.fshr.i24(i24, i24, i24) #0
+declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>) #0
+
+declare i32 @llvm.fshr.i32(i32, i32, i32) #0
+declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) #0
+declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>) #0
+declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #0
+declare <5 x i32> @llvm.fshr.v5i32(<5 x i32>, <5 x i32>, <5 x i32>) #0
+declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>) #0
+
+declare i48 @llvm.fshr.i48(i48, i48, i48) #0
+
+declare i64 @llvm.fshr.i64(i64, i64, i64) #0
+declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) #0
+
+declare i128 @llvm.fshr.i128(i128, i128, i128) #0
+declare <2 x i128> @llvm.fshr.v2i128(<2 x i128>, <2 x i128>, <2 x i128>) #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir
new file mode 100644
index 000000000000..93af75e3d6ed
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir
@@ -0,0 +1,1254 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s
+
+---
+name: test_fshl_s32_s32
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; SI-LABEL: name: test_fshl_s32_s32
+    ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; SI: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[C]](s32)
+    ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; SI: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]]
+    ; SI: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32)
+    ; SI: $vgpr0 = COPY [[FSHR1]](s32)
+    ; VI-LABEL: name: test_fshl_s32_s32
+    ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; VI: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[C]](s32)
+    ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; VI: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]]
+    ; VI: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32)
+    ; VI: $vgpr0 = COPY [[FSHR1]](s32)
+    ; GFX9-LABEL: name: test_fshl_s32_s32
+    ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[C]](s32)
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; GFX9: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]]
+    ; GFX9: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32)
+    ; GFX9: $vgpr0 = COPY [[FSHR1]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = G_FSHL %0, %1, %2
+    $vgpr0 = COPY %3
+...
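This first MIR test shows the strategy for types where the inverse funnel shift is legal: since AMDGPU can select a 32-bit G_FSHR directly (v_alignbit_b32), G_FSHL is not expanded into plain shifts but rewritten as fshl(x, y, z) = fshr(x >> 1, fshr(x, y, 1), ~z), exactly the sequence the SI/VI/GFX9 checks above spell out. A small self-contained check of that identity, assuming nothing beyond the formula visible in the CHECK lines:

#include <cassert>
#include <cstdint>

// Reference 32-bit funnel shifts; shift amounts are taken modulo 32.
// Stand-alone illustration of the identity used above, not code from
// this patch.
static uint32_t fshl32(uint32_t x, uint32_t y, uint32_t z) {
  unsigned s = z & 31;
  return s ? (x << s) | (y >> (32 - s)) : x;
}

static uint32_t fshr32(uint32_t x, uint32_t y, uint32_t z) {
  unsigned s = z & 31;
  return s ? (x << (32 - s)) | (y >> s) : y;
}

int main() {
  uint32_t x = 0xDEADBEEFu, y = 0x12345678u;
  for (uint32_t z = 0; z < 64; ++z) {
    // The lowering above: fshl(x, y, z) == fshr(x >> 1, fshr(x, y, 1), ~z).
    assert(fshl32(x, y, z) == fshr32(x >> 1, fshr32(x, y, 1), ~z));
  }
  return 0;
}

Because ~z mod 32 equals 31 - (z mod 32), the pre-shift of x by one bit exactly compensates for the off-by-one in the inverted amount, including the z mod 32 == 0 case where the result must be x unchanged.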
+
+---
+name: test_fshl_v2s32_v2s32
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
+
+    ; SI-LABEL: name: test_fshl_v2s32_v2s32
+    ; SI: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; SI: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
+    ; SI: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr4_vgpr5
+    ; SI: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; SI: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+    ; SI: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>)
+    ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; SI: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[C]](s32)
+    ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
+    ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; SI: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]]
+    ; SI: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32)
+    ; SI: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[C]](s32)
+    ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+    ; SI: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]]
+    ; SI: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[LSHR1]], [[FSHR2]], [[XOR1]](s32)
+    ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR1]](s32), [[FSHR3]](s32)
+    ; SI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ; VI-LABEL: name: test_fshl_v2s32_v2s32
+    ; VI: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; VI: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
+    ; VI: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr4_vgpr5
+    ; VI: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; VI: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+    ; VI: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; VI: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[C]](s32)
+    ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
+    ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; VI: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]]
+    ; VI: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32)
+    ; VI: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[C]](s32)
+    ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+    ; VI: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]]
+    ; VI: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[LSHR1]], [[FSHR2]], [[XOR1]](s32)
+    ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR1]](s32), [[FSHR3]](s32)
+    ; VI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ; GFX9-LABEL: name: test_fshl_v2s32_v2s32
+    ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
+    ; GFX9: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr4_vgpr5
+    ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+    ; GFX9: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[C]](s32)
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; GFX9: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]]
+    ; GFX9: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32)
+    ; GFX9: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[C]](s32)
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+    ; GFX9: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]]
+    ; GFX9: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[LSHR1]], [[FSHR2]], [[XOR1]](s32)
+    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR1]](s32), [[FSHR3]](s32)
+    ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %1:_(<2 x s32>) = COPY $vgpr2_vgpr3
+    %2:_(<2 x s32>) = COPY $vgpr4_vgpr5
+    %3:_(<2 x s32>) = G_FSHL %0, %1, %2
+    $vgpr0_vgpr1 = COPY %3
+...
+
+---
+name: test_fshl_s16_s16
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; SI-LABEL: name: test_fshl_s16_s16
+    ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; SI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]]
+    ; SI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; SI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C1]]
+    ; SI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C]]
+    ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16)
+    ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[ZEXT]](s32)
+    ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32)
+    ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]]
+    ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C2]](s32)
+    ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16)
+    ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]]
+    ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[ZEXT1]](s32)
+    ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+    ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC1]], [[TRUNC2]]
+    ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16)
+    ; SI: $vgpr0 = COPY [[ANYEXT]](s32)
+    ; VI-LABEL: name: test_fshl_s16_s16
+    ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; VI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C]]
+    ; VI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; VI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C1]]
+    ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C]]
+    ; VI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16)
+    ; VI: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C2]](s16)
+    ; VI: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[LSHR]], [[AND1]](s16)
+    ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL]], [[LSHR1]]
+    ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16)
+    ; VI: $vgpr0 = COPY [[ANYEXT]](s32)
+    ; GFX9-LABEL: name: test_fshl_s16_s16
+    ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C]]
+    ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; GFX9: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C1]]
+    ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16)
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C2]](s16)
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[LSHR]], [[AND1]](s16)
+    ; GFX9: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL]], [[LSHR1]]
+    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16)
+    ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s16) = G_TRUNC %0
+    %4:_(s16) = G_TRUNC %1
+    %5:_(s16) = G_TRUNC %2
+    %6:_(s16) = G_FSHL %3, %4, %5
+    %7:_(s32) = G_ANYEXT %6
+    $vgpr0 = COPY %7
+...
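For 16-bit values there is no native funnel shift, so the legalizer falls back to the shift expansion visible in all three check prefixes: a left shift by the masked amount, OR'd with a right shift that is split into a constant shift by one followed by a variable shift by the inverted amount. A hand-written sketch of the same computation (not code from this patch):

#include <cstdint>

// The shift-based expansion checked above for 16-bit G_FSHL:
//   fshl(x, y, z) --> (x << (z & 15)) | ((y >> 1) >> (~z & 15))
// Pre-shifting y right by one keeps both variable shift amounts in
// [0, 15], so no shift is ever out of range. Hand-written sketch, not
// code from this patch.
static uint16_t fshl16_expanded(uint16_t x, uint16_t y, uint16_t z) {
  unsigned s = z & 15;               // effective shift amount
  unsigned inv = ~z & 15;            // 15 - s
  return uint16_t((x << s) | (uint16_t(y >> 1) >> inv));
}

When z & 15 == 0 the right-hand term collapses to (y >> 1) >> 15 == 0, so the result is just x, matching the fshl definition without ever issuing a shift by 16.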
+
+---
+name: test_fshl_v2s16_v2s16
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; SI-LABEL: name: test_fshl_v2s16_v2s16
+    ; SI: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; SI: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; SI: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+    ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>)
+    ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+    ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+    ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+    ; SI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]]
+    ; SI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; SI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C2]]
+    ; SI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]]
+    ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16)
+    ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
+    ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[ZEXT]](s32)
+    ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32)
+    ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
+    ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C4]]
+    ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C3]](s32)
+    ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16)
+    ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C4]]
+    ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[ZEXT1]](s32)
+    ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32)
+    ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC2]], [[TRUNC3]]
+    ; SI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]]
+    ; SI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C2]]
+    ; SI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]]
+    ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16)
+    ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[ZEXT2]](s32)
+    ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32)
+    ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+    ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C4]]
+    ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[COPY7]](s32)
+    ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16)
+    ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32)
+    ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C4]]
+    ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[ZEXT3]](s32)
+    ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32)
+    ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[TRUNC4]], [[TRUNC5]]
+    ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+    ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C]](s32)
+    ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL2]]
+    ; SI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
+    ; SI: $vgpr0 = COPY [[BITCAST3]](<2 x s16>)
+    ; VI-LABEL: name: test_fshl_v2s16_v2s16
+    ; VI: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; VI: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; VI: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+    ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+    ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>)
+    ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+    ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+    ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+    ; VI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C1]]
+    ; VI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; VI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC4]], [[C2]]
+    ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]]
+    ; VI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16)
+    ; VI: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C3]](s16)
+    ; VI: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[LSHR3]], [[AND1]](s16)
+    ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL]], [[LSHR4]]
+    ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C1]]
+    ; VI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC5]], [[C2]]
+    ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]]
+    ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND2]](s16)
+    ; VI: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C3]](s16)
+    ; VI: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[LSHR5]], [[AND3]](s16)
+    ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[SHL1]], [[LSHR6]]
+    ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+    ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
+    ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]]
+    ; VI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
+    ; VI: $vgpr0 = COPY [[BITCAST3]](<2 x s16>)
+    ; GFX9-LABEL: name: test_fshl_v2s16_v2s16
+    ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX9: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[C]](s32)
+    ; GFX9: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[COPY2]], [[BUILD_VECTOR_TRUNC]]
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[C1]](s32)
+    ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY2]], [[BUILD_VECTOR_TRUNC1]]
+    ; GFX9: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR_TRUNC]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[C2]](s32)
+    ; GFX9: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[COPY]], [[AND]](<2 x s16>)
+    ; GFX9: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[COPY1]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[LSHR]], [[AND1]](<2 x s16>)
+    ; GFX9: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL]], [[LSHR1]]
+    ; GFX9: $vgpr0 = COPY [[OR]](<2 x s16>)
+    %0:_(<2 x s16>) = COPY $vgpr0
+    %1:_(<2 x s16>) = COPY $vgpr1
+    %2:_(<2 x s16>) = COPY $vgpr2
+    %3:_(<2 x s16>) = G_FSHL %0, %1, %2
+    $vgpr0 = COPY %3
+...
+
+---
+name: test_fshl_s64_s64
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
+
+    ; SI-LABEL: name: test_fshl_s64_s64
+    ; SI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; SI: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+    ; SI: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5
+    ; SI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 63
+    ; SI: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]]
+    ; SI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; SI: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY2]], [[C1]]
+    ; SI: [[AND1:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[C]]
+    ; SI: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64)
+    ; SI: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[TRUNC]](s32)
+    ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; SI: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C2]](s32)
+    ; SI: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64)
+    ; SI: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[LSHR]], [[TRUNC1]](s32)
+    ; SI: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL]], [[LSHR1]]
+    ; SI: $vgpr0_vgpr1 = COPY [[OR]](s64)
+    ; VI-LABEL: name: test_fshl_s64_s64
+    ; VI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; VI: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+    ; VI: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5
+    ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 63
+    ; VI: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]]
+    ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; VI: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY2]], [[C1]]
+    ; VI: [[AND1:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[C]]
+    ; VI: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64)
+    ; VI: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[TRUNC]](s32)
+    ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; VI: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C2]](s32)
+    ; VI: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64)
+    ; VI: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[LSHR]], [[TRUNC1]](s32)
+    ; VI: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL]], [[LSHR1]]
+    ; VI: $vgpr0_vgpr1 = COPY [[OR]](s64)
+    ; GFX9-LABEL: name: test_fshl_s64_s64
+    ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+    ; GFX9: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5
+    ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 63
+    ; GFX9: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]]
+    ; GFX9: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; GFX9: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY2]], [[C1]]
+    ; GFX9: [[AND1:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[C]]
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64)
+    ; GFX9: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[TRUNC]](s32)
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C2]](s32)
+    ; GFX9: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64)
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[LSHR]], [[TRUNC1]](s32)
+    ; GFX9: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL]], [[LSHR1]]
+    ; GFX9: $vgpr0_vgpr1 = COPY [[OR]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(s64) = COPY $vgpr4_vgpr5
+    %3:_(s64) = G_FSHL %0, %1, %2
+    $vgpr0_vgpr1 = COPY %3
+...
+
+---
+name: test_fshl_s8_s8
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; SI-LABEL: name: test_fshl_s8_s8
+    ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[COPY4]]
+    ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; SI: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY5]], [[COPY6]]
+    ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[XOR]](s32)
+    ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[COPY8]]
+    ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[AND]](s32)
+    ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]]
+    ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[AND2]](s32)
+    ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]]
+    ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]]
+    ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND4]], [[AND3]](s32)
+    ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[AND1]](s32)
+    ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]]
+    ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]]
+    ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[AND5]](s32)
+    ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[SHL]](s32)
+    ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[COPY16]]
+    ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
+    ; SI: $vgpr0 = COPY [[COPY17]](s32)
+    ; VI-LABEL: name: test_fshl_s8_s8
+    ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[COPY4]]
+    ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; VI: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY5]], [[COPY6]]
+    ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[XOR]](s32)
+    ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[COPY8]]
+    ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[AND]](s32)
+    ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]]
+    ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[AND2]](s32)
+    ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16)
+    ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]]
+    ; VI: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
+    ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[AND3]](s32)
+    ; VI: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[AND4]], [[TRUNC3]](s16)
+    ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[AND1]](s32)
+    ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]]
+    ; VI: [[COPY12:%[0-9]+]]:_(s16) = COPY [[LSHR]](s16)
+    ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[COPY12]], [[C4]]
+    ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[AND5]](s32)
+    ; VI: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[AND6]], [[TRUNC4]](s16)
+    ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SHL]](s16)
+    ; VI: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16)
+    ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[ANYEXT]], [[ANYEXT1]]
+    ; VI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
+    ; VI: $vgpr0 = COPY [[COPY13]](s32)
+    ; GFX9-LABEL: name: test_fshl_s8_s8
+    ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[COPY4]]
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY5]], [[COPY6]]
+    ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[XOR]](s32)
+    ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[COPY8]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[AND]](s32)
+    ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]]
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[AND2]](s32)
+    ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16)
+    ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]]
+    ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
+    ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[AND3]](s32)
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[AND4]], [[TRUNC3]](s16)
+    ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[AND1]](s32)
+    ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]]
+    ; GFX9: [[COPY12:%[0-9]+]]:_(s16) = COPY [[LSHR]](s16)
+    ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[COPY12]], [[C4]]
+    ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[AND5]](s32)
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[AND6]], [[TRUNC4]](s16)
+    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SHL]](s16)
+    ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16)
+    ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[ANYEXT]], [[ANYEXT1]]
+    ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
+    ; GFX9: $vgpr0 = COPY [[COPY13]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s8) = G_TRUNC %0
+    %4:_(s8) = G_TRUNC %1
+    %5:_(s8) = G_TRUNC %2
+    %6:_(s8) = G_FSHL %3, %4, %5
+    %7:_(s32) = G_ANYEXT %6
+    $vgpr0 = COPY %7
+...
+
+---
+name: test_fshl_s24_s24
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; SI-LABEL: name: test_fshl_s24_s24
+    ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 23
+    ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]]
+    ; SI: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; SI: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; SI: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000
+    ; SI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C3]]
+    ; SI: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; SI: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C4]], [[AND1]]
+    ; SI: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SUB]], [[FPTOUI]]
+    ; SI: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[MUL]]
+    ; SI: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH]]
+    ; SI: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[AND]], [[ADD]]
+    ; SI: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH1]], [[AND1]]
+    ; SI: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; SI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB1]](s32), [[AND1]]
+    ; SI: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[AND1]]
+    ; SI: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB2]], [[SUB1]]
+    ; SI: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT]](s32), [[AND1]]
+    ; SI: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[AND1]]
+    ; SI: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]]
+    ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+    ; SI: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[COPY5]], [[COPY6]]
+    ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+    ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]]
+    ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[AND2]](s32)
+    ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C2]]
+    ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C2]]
+    ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND4]], [[AND3]](s32)
+    ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[SUB4]](s32)
+    ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C2]]
+    ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C2]]
+    ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[AND5]](s32)
+    ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[SHL]](s32)
+    ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY13]], [[COPY14]]
+    ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
+    ; SI: $vgpr0 = COPY [[COPY15]](s32)
+    ; VI-LABEL: name: test_fshl_s24_s24
+    ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 23
+    ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]]
+    ; VI: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; VI: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; VI: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000
+    ; VI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C3]]
+    ; VI: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; VI: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C4]], [[AND1]]
+    ; VI: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SUB]], [[FPTOUI]]
+    ; VI: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[MUL]]
+    ; VI: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH]]
+    ; VI: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[AND]], [[ADD]]
+    ; VI: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH1]], [[AND1]]
+    ; VI: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; VI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB1]](s32), [[AND1]]
+    ; VI: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[AND1]]
+    ; VI: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB2]], [[SUB1]]
+    ; VI: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT]](s32), [[AND1]]
+    ; VI: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[AND1]]
+    ; VI: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]]
+    ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+    ; VI: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[COPY5]], [[COPY6]]
+    ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+    ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]]
+    ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[AND2]](s32)
+    ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C2]]
+    ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C2]]
+    ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND4]], [[AND3]](s32)
+    ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[SUB4]](s32)
+    ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C2]]
+    ; VI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C2]]
+    ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[AND5]](s32)
+    ; VI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[SHL]](s32)
+    ; VI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY13]], [[COPY14]]
+    ; VI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
+    ; VI: $vgpr0 = COPY [[COPY15]](s32)
+    ; GFX9-LABEL: name: test_fshl_s24_s24
+    ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 23
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]]
+    ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000
+    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C3]]
+    ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C4]], [[AND1]]
+    ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SUB]], [[FPTOUI]]
+    ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[MUL]]
+    ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH]]
+    ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[AND]], [[ADD]]
+    ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH1]], [[AND1]]
+    ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB1]](s32), [[AND1]]
+    ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[AND1]]
+    ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB2]], [[SUB1]]
+    ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT]](s32), [[AND1]]
+    ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[AND1]]
+    ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]]
+    ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+    ; GFX9: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[COPY5]], [[COPY6]]
+    ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+    ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]]
+    ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[AND2]](s32)
+    ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C2]]
+    ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C2]]
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND4]], [[AND3]](s32)
+    ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[SUB4]](s32)
+    ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C2]]
+    ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX9: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C2]]
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[AND5]](s32)
+    ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[SHL]](s32)
+    ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY13]], [[COPY14]]
+    ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
+    ; GFX9: $vgpr0 = COPY [[COPY15]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s24) = G_TRUNC %0
+    %4:_(s24) = G_TRUNC %1
+    %5:_(s24) = G_TRUNC %2
+    %6:_(s24) = G_FSHL %3, %4, %5
+    %7:_(s32) = G_ANYEXT %6
+    $vgpr0 = COPY %7
+...
+
+---
+name: test_fshl_v3s16_v3s16
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+    ; SI-LABEL: name: test_fshl_v3s16_v3s16
+    ; SI: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; SI: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; SI: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+    ; SI: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+    ; SI: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+    ; SI: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+    ; SI: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+    ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>)
+    ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+    ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>)
+    ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+    ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>)
+    ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
+    ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32)
+    ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32)
+    ; SI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>)
+    ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32)
+    ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32)
+    ; SI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]]
+    ; SI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; SI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C2]]
+    ; SI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]]
+    ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16)
+    ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
+    ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[ZEXT]](s32)
+    ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32)
+    ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+    ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C4]]
+    ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C3]](s32)
+    ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16)
+    ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32)
+    ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C4]]
+    ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[ZEXT1]](s32)
+    ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
+    ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC3]], [[TRUNC4]]
+    ; SI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]]
+    ; SI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C2]]
+    ; SI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]]
+    ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16)
+    ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY9]], [[ZEXT2]](s32)
+    ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32)
+    ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+    ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C4]]
+    ; SI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[COPY10]](s32)
+    ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16)
+    ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32)
+    ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C4]]
+    ; SI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[ZEXT3]](s32)
+    ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32)
+    ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[TRUNC5]], [[TRUNC6]]
+    ; SI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C1]]
+    ; SI: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C2]]
+    ; SI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C1]]
+    ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16)
+    ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
+    ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[ZEXT4]](s32)
+    ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32)
+    ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+    ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32)
+    ; SI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C4]]
+    ; SI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[COPY14]](s32)
+    ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[AND9]](s16)
+    ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32)
+    ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C4]]
+    ; SI: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[AND11]], [[ZEXT5]](s32)
+    ; SI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32)
+    ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[TRUNC7]], [[TRUNC8]]
+    ; SI: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; SI: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>)
+    ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; SI: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32)
+    ; SI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; SI: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32)
+    ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+    ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C]](s32)
+    ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL3]]
+    ; SI: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32)
+    ; SI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32)
+    ; SI: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C4]]
+    ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND12]], [[C]](s32)
+    ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL4]]
+    ; SI: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32)
+    ; SI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LSHR12]](s32)
+    ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C4]]
+    ; SI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32)
+    ; SI: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C4]]
+    ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C]](s32)
+    ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND13]], [[SHL5]]
+    ; SI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
+    ; SI: $vgpr0 = COPY [[BITCAST8]](<2 x s16>)
+    ; SI: $vgpr1 = COPY [[BITCAST9]](<2 x s16>)
+    ; SI: $vgpr2 = COPY [[BITCAST10]](<2 x s16>)
+    ; VI-LABEL: name: test_fshl_v3s16_v3s16
+    ; VI: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; VI: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; VI: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+    ; VI: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+    ; VI: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+    ; VI: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+    ; VI: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+    ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>)
+    ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+    ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+    ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+    ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>)
+    ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
+    ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+    ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>)
+    ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
+    ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32)
+    ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32)
+    ; VI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>)
+    ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32)
+    ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32)
+    ; VI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C1]]
+    ; VI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; VI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C2]]
+    ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]]
+    ; VI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16)
+    ; VI: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C3]](s16)
+    ; VI: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[LSHR6]], [[AND1]](s16)
+    ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL]], [[LSHR7]]
+    ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C1]]
+    ; VI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C2]]
+    ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]]
+    ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND2]](s16)
+    ; VI: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC4]], [[C3]](s16)
+    ; VI: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[LSHR8]], [[AND3]](s16)
+    ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[SHL1]], [[LSHR9]]
+    ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C1]]
+    ; VI: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC8]], [[C2]]
+    ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C1]]
+    ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[AND4]](s16)
+    ; VI: [[LSHR10:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC5]], [[C3]](s16)
+    ; VI: [[LSHR11:%[0-9]+]]:_(s16) = G_LSHR [[LSHR10]], [[AND5]](s16)
+    ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[SHL2]], [[LSHR11]]
+    ; VI: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; VI: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>)
+    ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; VI: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32)
+    ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; VI: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32)
+    ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+    ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
+    ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
+    ; VI: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32)
+    ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32)
+    ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C4]]
+    ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32)
+    ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL4]]
+    ; VI: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32)
+    ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR12]](s32)
+    ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C4]]
+    ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32)
+    ; VI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C4]]
+    ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C]](s32)
+    ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[SHL5]]
+    ; VI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
+    ; VI: $vgpr0 = COPY [[BITCAST8]](<2 x s16>)
+    ; VI: $vgpr1 = COPY [[BITCAST9]](<2 x s16>)
+    ; VI: $vgpr2 = COPY [[BITCAST10]](<2 x s16>)
+    ; GFX9-LABEL: name: test_fshl_v3s16_v3s16
+    ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX9: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+    ; GFX9: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+    ; GFX9: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+    ; GFX9: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+    ; GFX9: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+    ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
+    ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32)
+    ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
+    ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[COPY9]](s32)
+    ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>)
+    ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+    ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>)
+    ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+    ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+    ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32)
+    ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
+    ; GFX9: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>)
+    ; GFX9: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32)
+    ; GFX9: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>)
+    ; GFX9: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32)
+    ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32)
+    ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
+    ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF1]](s32)
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[C1]](s32)
+    ; GFX9: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[BUILD_VECTOR_TRUNC4]], [[BUILD_VECTOR_TRUNC6]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[C2]](s32)
+    ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR_TRUNC4]], [[BUILD_VECTOR_TRUNC7]]
+    ; GFX9: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR_TRUNC6]]
+    ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[C3]](s32)
+    ; GFX9: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC]], [[AND]](<2 x s16>)
+    ; GFX9: [[LSHR6:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR_TRUNC2]], [[BUILD_VECTOR_TRUNC8]](<2 x s16>)
+    ; GFX9: [[LSHR7:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[LSHR6]], [[AND1]](<2 x s16>)
+    ; GFX9: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL]], [[LSHR7]]
+    ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[COPY21:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32)
+    ; GFX9: [[AND2:%[0-9]+]]:_(<2 x s16>) = G_AND [[BUILD_VECTOR_TRUNC5]], [[BUILD_VECTOR_TRUNC9]]
+    ; GFX9: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; GFX9: [[COPY23:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC10:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY22]](s32), [[COPY23]](s32)
+    ; GFX9: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR_TRUNC5]], [[BUILD_VECTOR_TRUNC10]]
+    ; GFX9: [[AND3:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR1]], [[BUILD_VECTOR_TRUNC9]]
+    ; GFX9: [[COPY24:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+    ; GFX9: [[COPY25:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC11:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY24]](s32), [[COPY25]](s32)
+    ; GFX9: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC1]], [[AND2]](<2 x s16>)
+    ; GFX9: [[LSHR8:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR_TRUNC3]], [[BUILD_VECTOR_TRUNC11]](<2 x s16>)
+    ; GFX9: [[LSHR9:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[LSHR8]], [[AND3]](<2 x s16>)
+    ; GFX9: [[OR1:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL1]], [[LSHR9]]
+    ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX9: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX9: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[OR]](<2 x s16>)
+    ; GFX9: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32)
+    ; GFX9: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[OR1]](<2 x s16>)
+    ; GFX9: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32)
+    ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>)
+    ; GFX9: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; GFX9: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32)
+    ; GFX9: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; GFX9: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32)
+    ; GFX9: [[COPY26:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32)
+    ; GFX9: [[COPY27:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC12:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY26]](s32), [[COPY27]](s32)
+    ; GFX9: [[COPY28:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32)
+    ; GFX9: [[COPY29:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC13:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY28]](s32), [[COPY29]](s32)
+    ; GFX9: [[COPY30:%[0-9]+]]:_(s32) = COPY [[LSHR12]](s32)
+    ; GFX9: [[COPY31:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC14:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY30]](s32), [[COPY31]](s32)
+    ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC12]](<2 x s16>)
+    ; GFX9: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC13]](<2 x s16>)
+    ; GFX9: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC14]](<2 x s16>)
+    %0:_(<2 x s16>) = COPY $vgpr0
+    %1:_(<2 x s16>) = COPY $vgpr1
+    %2:_(<2 x s16>) = COPY $vgpr2
+    %3:_(<2 x s16>) = COPY $vgpr3
+    %4:_(<2 x s16>) = COPY $vgpr4
+    %5:_(<2 x s16>) = COPY $vgpr5
+    %6:_(<2 x s16>) = G_IMPLICIT_DEF
+    %7:_(<6 x s16>) = G_CONCAT_VECTORS %0(<2 x s16>), %1(<2 x s16>), %6(<2 x s16>)
+    %8:_(<3 x s16>), %9:_(<3 x s16>) = G_UNMERGE_VALUES %7(<6 x s16>)
+    %10:_(<6 x s16>) = G_CONCAT_VECTORS %2(<2 x s16>), %3(<2 x s16>), %6(<2 x s16>)
+    %11:_(<3 x s16>), %12:_(<3 x s16>) = G_UNMERGE_VALUES %10(<6 x s16>)
+    %13:_(<6 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>), %6(<2 x s16>)
+    %14:_(<3 x s16>), %15:_(<3 x s16>) = G_UNMERGE_VALUES %13(<6 x s16>)
+    %16:_(<3 x s16>) = G_FSHL %8, %11, %14(<3 x s16>)
+    %17:_(<3 x s16>) = G_IMPLICIT_DEF
+    %18:_(<6 x s16>) = G_CONCAT_VECTORS %16(<3 x s16>), %17(<3 x s16>)
+    %19:_(<2 x s16>), %20:_(<2 x s16>), %21:_(<2 x s16>) = G_UNMERGE_VALUES %18(<6 x s16>)
+    $vgpr0 = COPY %19(<2 x s16>)
+    $vgpr1 = COPY %20(<2 x s16>)
+    $vgpr2 = COPY %21(<2 x s16>)
+...
+
+---
+name: test_fshl_v4s16_v4s16
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
+
+    ; SI-LABEL: name: test_fshl_v4s16_v4s16
+    ; SI: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    ; SI: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
+    ; SI: [[COPY2:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr4_vgpr5
+    ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+    ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; SI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
+    ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+    ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+    ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+    ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+    ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>)
+    ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
+    ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
+    ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32)
+    ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32)
+    ; SI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>)
+    ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32)
+    ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32)
+    ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32)
+    ; SI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]]
+    ; SI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; SI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C2]]
+    ; SI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]]
+    ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16)
+    ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
+    ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[ZEXT]](s32)
+    ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32)
+    ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+    ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C4]]
+    ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C3]](s32)
+    ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16)
+    ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32)
+    ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C4]]
+    ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[ZEXT1]](s32)
+    ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
+    ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC4]], [[TRUNC5]]
+    ; SI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]]
+    ; SI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C2]]
+    ; SI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]]
+    ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16)
+    ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[ZEXT2]](s32)
+    ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32)
+    ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+    ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C4]]
+    ; SI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[COPY7]](s32)
+    ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16)
+    ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32)
+    ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C4]]
+    ; SI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[ZEXT3]](s32)
+    ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32)
+    ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[TRUNC6]], [[TRUNC7]]
+    ; SI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C1]]
+    ; SI: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C2]]
+    ; SI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C1]]
+    ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16)
+    ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
+    ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[ZEXT4]](s32)
+    ; SI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32)
+    ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+    ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32)
+    ; SI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C4]]
+    ; SI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[COPY11]](s32)
+    ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[AND9]](s16)
+    ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32)
+    ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C4]]
+    ; SI: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[AND11]], [[ZEXT5]](s32)
+    ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32)
+    ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[TRUNC8]], [[TRUNC9]]
+    ; SI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C1]]
+    ; SI: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC3]], [[C2]]
+    ; SI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C1]]
+    ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[AND12]](s16)
+    ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY14]], [[ZEXT6]](s32)
+    ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32)
+    ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+    ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; SI: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C4]]
+    ; SI: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[AND14]], [[COPY15]](s32)
+    ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[AND13]](s16)
+    ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR12]](s32)
+    ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C4]]
+    ; SI: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[AND15]], [[ZEXT7]](s32)
+    ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR13]](s32)
+    ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[TRUNC10]], [[TRUNC11]]
+    ; SI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+    ; SI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C]](s32)
+    ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL4]]
+    ; SI: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32)
+    ; SI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; SI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+    ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C]](s32)
+    ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL5]]
+    ; SI: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
+    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>)
+    ; SI: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; VI-LABEL: name: test_fshl_v4s16_v4s16
+    ; VI: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    ; VI: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
+    ; VI: [[COPY2:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr4_vgpr5
+    ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+    ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+    ; VI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
+    ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+    ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+    ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+    ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+    ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+    ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
+    ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+    ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
+    ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>)
+    ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
+    ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
+    ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32)
+    ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32)
+    ; VI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>)
+    ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32)
+    ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32)
+    ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32)
+    ; VI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C1]]
+    ; VI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; VI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC8]], [[C2]]
+    ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]]
+    ; VI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16)
+    ; VI: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC4]], [[C3]](s16)
+    ; VI: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[LSHR6]], [[AND1]](s16)
+    ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL]], [[LSHR7]]
+    ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C1]]
+    ; VI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC9]], [[C2]]
+    ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]]
+    ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND2]](s16)
+    ; VI: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC5]], [[C3]](s16)
+    ; VI: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[LSHR8]], [[AND3]](s16)
+    ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[SHL1]], [[LSHR9]]
+    ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C1]]
+    ; VI: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC10]], [[C2]]
+    ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C1]]
+    ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[AND4]](s16)
+    ; VI: [[LSHR10:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC6]], [[C3]](s16)
+    ; VI: [[LSHR11:%[0-9]+]]:_(s16) = G_LSHR [[LSHR10]], [[AND5]](s16)
+    ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[SHL2]], [[LSHR11]]
+    ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C1]]
+    ; VI: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC11]], [[C2]]
+    ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C1]]
+    ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[AND6]](s16)
+    ; VI: [[LSHR12:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC7]], [[C3]](s16)
+    ; VI: [[LSHR13:%[0-9]+]]:_(s16) = G_LSHR [[LSHR12]], [[AND7]](s16)
+    ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[SHL3]], [[LSHR13]]
+    ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+    ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
+    ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
+    ; VI: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32)
+    ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+    ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32)
+    ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]]
+    ; VI: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
+    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>)
+    ; VI: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; GFX9-LABEL: name: test_fshl_v4s16_v4s16
+    ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
+    ; GFX9: [[COPY2:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr4_vgpr5
+    ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+    ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
+    ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[C]](s32)
+    ; GFX9: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[UV4]], [[BUILD_VECTOR_TRUNC]]
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[C1]](s32)
+    ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV4]], [[BUILD_VECTOR_TRUNC1]]
+    ; GFX9: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR_TRUNC]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[C2]](s32)
+    ; GFX9: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV]], [[AND]](<2 x s16>)
+    ; GFX9: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[UV2]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[LSHR]], [[AND1]](<2 x s16>)
+    ; GFX9: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL]], [[LSHR1]]
+    ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32)
+    ; GFX9: [[AND2:%[0-9]+]]:_(<2 x s16>) = G_AND [[UV5]], [[BUILD_VECTOR_TRUNC3]]
+    ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[COPY9]](s32)
+    ; GFX9: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV5]], [[BUILD_VECTOR_TRUNC4]]
+    ; GFX9: [[AND3:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR1]], [[BUILD_VECTOR_TRUNC3]]
+    ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX9: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV1]], [[AND2]](<2 x s16>)
+    ; GFX9: [[LSHR2:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[UV3]], [[BUILD_VECTOR_TRUNC5]](<2 x s16>)
+    ; GFX9: [[LSHR3:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[LSHR2]], [[AND3]](<2 x s16>)
+    ; GFX9: [[OR1:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL1]], [[LSHR3]]
+    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[OR]](<2 x s16>), [[OR1]](<2 x s16>)
+    ; GFX9: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    %1:_(<4 x s16>) = COPY $vgpr2_vgpr3
+    %2:_(<4 x s16>) = COPY $vgpr4_vgpr5
+    %3:_(<4 x s16>) = G_FSHL %0, %1, %2
+    $vgpr0_vgpr1 = COPY %3
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir
index b26b28f6bf34..432866061c99 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir
@@ -90,11 +90,27 @@ body: |
     ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-    ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
-    ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
-    ; SI: [[FSHR:%[0-9]+]]:_(s16) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s16)
-    ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s16)
+    ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; SI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]]
+    ; SI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; SI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C1]]
+    ; SI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C]]
+    ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C2]](s32)
+    ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16)
+    ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SHL]](s32)
+    ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[ZEXT]](s32)
+    ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32)
+    ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16)
+    ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]]
+    ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[ZEXT1]](s32)
+    ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC1]], [[TRUNC2]]
+    ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16)
     ; SI: $vgpr0 = COPY [[ANYEXT]](s32)
     ; VI-LABEL: name: test_fshr_s16_s16
     ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -103,8 +119,17 @@ body: |
     ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
     ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
-    ; VI: [[FSHR:%[0-9]+]]:_(s16) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s16)
-    ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s16)
+    ; VI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C]]
+    ; VI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; VI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C1]]
+    ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C]]
+    ; VI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C2]](s16)
+    ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[SHL]], [[AND1]](s16)
+    ; VI: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[AND]](s16)
+    ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL1]], [[LSHR]]
+    ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16)
     ; VI: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX9-LABEL: name: test_fshr_s16_s16
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -113,8 +138,17 @@ body: |
     ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
     ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
-    ; GFX9: [[FSHR:%[0-9]+]]:_(s16) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s16)
-    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s16)
+    ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C]]
+    ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; GFX9: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C1]]
+    ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C2]](s16)
+    ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[SHL]], [[AND1]](s16)
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[AND]](s16)
+    ; GFX9: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL1]], [[LSHR]]
+    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16)
     ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
@@ -137,35 +171,194 @@ body: |
     ; SI: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; SI: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
     ; SI: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
-    ; SI: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>)
-    ; SI: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](<2 x s16>)
-    ; SI: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](<2 x s16>)
-    ; SI: [[FSHR:%[0-9]+]]:_(s16) = G_FSHR [[UV]], [[UV2]], [[UV4]](s16)
-    ; SI: [[FSHR1:%[0-9]+]]:_(s16) = G_FSHR [[UV1]], [[UV3]], [[UV5]](s16)
-    ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSHR]](s16), [[FSHR1]](s16)
-    ; SI: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; SI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+    ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+    ; SI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
+    ; SI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; SI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
+    ; SI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]]
+    ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16)
+    ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
+    ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[ZEXT]](s32)
+    ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32)
+    ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
+    ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]]
+    ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C4]](s32)
+    ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16)
+    ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]]
+    ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[ZEXT1]](s32)
+    ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
+    ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC]], [[TRUNC1]]
+    ; SI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
+    ; SI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
+    ; SI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]]
+    ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16)
+    ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[ZEXT2]](s32)
+    ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32)
+    ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]]
+    ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[COPY7]](s32)
+    ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16)
+    ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32)
+    ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C5]]
+    ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[ZEXT3]](s32)
+    ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32)
+    ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[TRUNC2]], [[TRUNC3]]
+    ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32)
+    ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+    ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY11]], [[COPY10]](s32)
+    ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32)
+    ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[COPY12]](s32)
+    ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C1]](s32)
+    ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY14]], [[SHL4]]
+    ; SI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
+    ; SI: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY2]], [[BITCAST3]]
+    ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>)
+    ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
+    ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32)
+    ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
+    ; SI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C2]]
+    ; SI: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC4]], [[C3]]
+    ; SI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]]
+    ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16)
+    ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16)
+    ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT]], [[ZEXT4]](s32)
+    ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32)
+    ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[SHL2]](s32)
+    ; SI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C5]]
+    ; SI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[COPY16]](s32)
+    ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[AND9]](s16)
+    ; SI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32)
+    ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C5]]
+    ; SI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[AND11]], [[ZEXT5]](s32)
+    ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32)
+    ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[TRUNC6]], [[TRUNC7]]
+    ; SI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C2]]
+    ; SI: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC5]], [[C3]]
+    ; SI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]]
+    ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[AND12]](s16)
+    ; SI: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[OR1]](s16)
+    ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT1]], [[ZEXT6]](s32)
+    ; SI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32)
+    ; SI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; SI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[SHL3]](s32)
+    ; SI: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C5]]
+    ; SI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[AND14]], [[COPY19]](s32)
+    ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[AND13]](s16)
+    ; SI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32)
+    ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C5]]
+    ; SI: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[AND15]], [[ZEXT7]](s32)
+    ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32)
+    ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[TRUNC8]], [[TRUNC9]]
+    ; SI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+    ; SI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
+    ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C1]](s32)
+    ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL7]]
+    ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
+    ; SI: $vgpr0 = COPY [[BITCAST5]](<2 x s16>)
     ; VI-LABEL: name: test_fshr_v2s16_v2s16
     ; VI: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; VI: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
     ; VI: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
-    ; VI: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>)
-    ; VI: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](<2 x s16>)
-    ; VI: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](<2 x s16>)
-    ; VI: [[FSHR:%[0-9]+]]:_(s16) = G_FSHR [[UV]], [[UV2]], [[UV4]](s16)
-    ; VI: [[FSHR1:%[0-9]+]]:_(s16) = G_FSHR [[UV1]], [[UV3]], [[UV5]](s16)
-    ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSHR]](s16), [[FSHR1]](s16)
-    ; VI: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; VI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+    ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+    ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+    ; VI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
+    ; VI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; VI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
+    ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]]
+    ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16)
+    ; VI: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C]](s16)
+    ; VI: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[LSHR2]], [[AND1]](s16)
+    ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL]], [[LSHR3]]
+    ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
+    ; VI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
+    ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]]
+    ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND2]](s16)
+    ; VI: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C]](s16)
+    ; VI: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[LSHR4]], [[AND3]](s16)
+    ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[SHL1]], [[LSHR5]]
+    ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+    ; VI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32)
+    ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32)
+    ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC4]], [[C]](s16)
+    ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC5]], [[C]](s16)
+    ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[C4]], [[C1]](s32)
+    ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY3]], [[SHL4]]
+    ; VI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
+    ; VI: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY2]], [[BITCAST3]]
+    ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>)
+    ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
+    ; VI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32)
+    ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
+    ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C2]]
+    ; VI: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C3]]
+    ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]]
+    ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[OR]], [[AND4]](s16)
+    ; VI: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[SHL2]], [[C]](s16)
+    ; VI: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[LSHR8]], [[AND5]](s16)
+    ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[SHL5]], [[LSHR9]]
+    ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C2]]
+    ; VI: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C3]]
+    ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]]
+    ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[OR1]], [[AND6]](s16)
+    ; VI: [[LSHR10:%[0-9]+]]:_(s16) = G_LSHR [[SHL3]], [[C]](s16)
+    ; VI: [[LSHR11:%[0-9]+]]:_(s16) = G_LSHR [[LSHR10]], [[AND7]](s16)
+    ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[SHL6]], [[LSHR11]]
+    ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+    ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
+    ; VI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
+    ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL7]]
+    ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
+    ; VI: $vgpr0 = COPY [[BITCAST5]](<2 x s16>)
     ; GFX9-LABEL: name: test_fshr_v2s16_v2s16
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
     ; GFX9: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
-    ; GFX9: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>)
-    ; GFX9: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](<2 x s16>)
-    ; GFX9: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](<2 x s16>)
-    ; GFX9: [[FSHR:%[0-9]+]]:_(s16) = G_FSHR [[UV]], [[UV2]], [[UV4]](s16)
-    ; GFX9: [[FSHR1:%[0-9]+]]:_(s16) = G_FSHR [[UV1]], [[UV3]], [[UV5]](s16)
-    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSHR]](s16), [[FSHR1]](s16)
-    ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[C]](s32)
+    ; GFX9: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[COPY2]], [[BUILD_VECTOR_TRUNC]]
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[C1]](s32)
+    ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY2]], [[BUILD_VECTOR_TRUNC1]]
+    ; GFX9: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR_TRUNC]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[C2]](s32)
+    ; GFX9: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[COPY]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+    ; GFX9: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[SHL]], [[AND1]](<2 x s16>)
+    ; GFX9: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[COPY1]], [[AND]](<2 x s16>)
+    ; GFX9: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL1]], [[LSHR]]
+    ; GFX9: $vgpr0 = COPY [[OR]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0
     %1:_(<2 x s16>) = COPY $vgpr1
     %2:_(<2 x s16>) = COPY $vgpr2
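
The checks above follow the shift-based expansion: for a power-of-two width BW, G_FSHR is rewritten as ((x << 1) << (~z & (BW-1))) | (y >> (z & (BW-1))), so both shift amounts stay in range even when z % BW == 0. A minimal scalar sketch in plain C++ of what the G_AND/G_XOR/G_SHL/G_LSHR/G_OR sequence computes on one s16 lane (fshr16 is a hypothetical helper name, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Sketch of the lowered G_FSHR on a single 16-bit lane; mirrors the
    // masked-amount shift sequence in the checks above.
    static uint16_t fshr16(uint16_t x, uint16_t y, uint16_t z) {
      unsigned lo = z & 15u;             // G_AND with 15: right-shift amount
      unsigned hi = ~z & 15u;            // G_XOR with -1, then G_AND with 15
      uint16_t t = (uint16_t)(x << 1);   // pre-shift so hi + 1 spans the width
      return (uint16_t)((uint16_t)(t << hi) | (uint16_t)(y >> lo));
    }

    int main() {
      assert(fshr16(0x1234, 0xabcd, 0) == 0xabcd); // amount 0 returns y
      assert(fshr16(0x1234, 0xabcd, 4) == 0x4abc); // (y >> 4) | (x << 12)
      return 0;
    }

The s64 test below is the same expansion with a 63 mask, with the amounts truncated to s32 for the wide shifts.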
@@ -183,20 +376,53 @@ body: |
     ; SI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
     ; SI: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
     ; SI: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5
-    ; SI: [[FSHR:%[0-9]+]]:_(s64) = G_FSHR [[COPY]], [[COPY1]], [[COPY2]](s64)
-    ; SI: $vgpr0_vgpr1 = COPY [[FSHR]](s64)
+    ; SI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 63
+    ; SI: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]]
+    ; SI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; SI: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY2]], [[C1]]
+    ; SI: [[AND1:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[C]]
+    ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; SI: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C2]](s32)
+    ; SI: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64)
+    ; SI: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SHL]], [[TRUNC]](s32)
+    ; SI: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64)
+    ; SI: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[TRUNC1]](s32)
+    ; SI: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL1]], [[LSHR]]
+    ; SI: $vgpr0_vgpr1 = COPY [[OR]](s64)
     ; VI-LABEL: name: test_fshr_s64_s64
     ; VI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
     ; VI: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
     ; VI: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5
-    ; VI: [[FSHR:%[0-9]+]]:_(s64) = G_FSHR [[COPY]], [[COPY1]], [[COPY2]](s64)
-    ; VI: $vgpr0_vgpr1 = COPY [[FSHR]](s64)
+    ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 63
+    ; VI: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]]
+    ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; VI: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY2]], [[C1]]
+    ; VI: [[AND1:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[C]]
+    ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; VI: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C2]](s32)
+    ; VI: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64)
+    ; VI: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SHL]], [[TRUNC]](s32)
+    ; VI: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64)
+    ; VI: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[TRUNC1]](s32)
+    ; VI: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL1]], [[LSHR]]
+    ; VI: $vgpr0_vgpr1 = COPY [[OR]](s64)
     ; GFX9-LABEL: name: test_fshr_s64_s64
     ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
     ; GFX9: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5
-    ; GFX9: [[FSHR:%[0-9]+]]:_(s64) = G_FSHR [[COPY]], [[COPY1]], [[COPY2]](s64)
-    ; GFX9: $vgpr0_vgpr1 = COPY [[FSHR]](s64)
+    ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 63
+    ; GFX9: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]]
+    ; GFX9: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; GFX9: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY2]], [[C1]]
+    ; GFX9: [[AND1:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[C]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C2]](s32)
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64)
+    ; GFX9: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SHL]], [[TRUNC]](s32)
+    ; GFX9: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64)
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[TRUNC1]](s32)
+    ; GFX9: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL1]], [[LSHR]]
+    ; GFX9: $vgpr0_vgpr1 = COPY [[OR]](s64)
     %0:_(s64) = COPY $vgpr0_vgpr1
     %1:_(s64) = COPY $vgpr2_vgpr3
     %2:_(s64) = COPY $vgpr4_vgpr5
@@ -214,32 +440,115 @@ body: |
     ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-    ; SI: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
-    ; SI: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32)
-    ; SI: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32)
-    ; SI: [[FSHR:%[0-9]+]]:_(s8) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s8)
-    ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s8)
-    ; SI: $vgpr0 = COPY [[ANYEXT]](s32)
+    ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[COPY4]]
+    ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; SI: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY5]], [[COPY6]]
+    ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[XOR]](s32)
+    ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[COPY8]]
+    ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]]
+    ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[AND2]](s32)
+    ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[AND1]](s32)
+    ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]]
+    ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[SHL]](s32)
+    ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY12]], [[AND3]](s32)
+    ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[AND]](s32)
+    ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]]
+    ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]]
+    ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[AND4]](s32)
+    ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[SHL1]](s32)
+    ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[COPY16]]
+    ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
+    ; SI: $vgpr0 = COPY [[COPY17]](s32)
     ; VI-LABEL: name: test_fshr_s8_s8
     ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-    ; VI: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
-    ; VI: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32)
-    ; VI: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32)
-    ; VI: [[FSHR:%[0-9]+]]:_(s8) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s8)
-    ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s8)
-    ; VI: $vgpr0 = COPY [[ANYEXT]](s32)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[COPY4]]
+    ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; VI: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY5]], [[COPY6]]
+    ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[XOR]](s32)
+    ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[COPY8]]
+    ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]]
+    ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[AND2]](s32)
+    ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16)
+    ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AND1]](s32)
+    ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]]
+    ; VI: [[COPY11:%[0-9]+]]:_(s16) = COPY [[SHL]](s16)
+    ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[AND3]](s32)
+    ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[COPY11]], [[TRUNC2]](s16)
+    ; VI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[AND]](s32)
+    ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]]
+    ; VI: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
+    ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[AND4]](s32)
+    ; VI: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[AND5]], [[TRUNC4]](s16)
+    ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SHL1]](s16)
+    ; VI: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16)
+    ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[ANYEXT]], [[ANYEXT1]]
+    ; VI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
+    ; VI: $vgpr0 = COPY [[COPY13]](s32)
     ; GFX9-LABEL: name: test_fshr_s8_s8
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-    ; GFX9: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
-    ; GFX9: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32)
-    ; GFX9: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32)
-    ; GFX9: [[FSHR:%[0-9]+]]:_(s8) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s8)
-    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s8)
-    ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[COPY4]]
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY5]], [[COPY6]]
+    ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[XOR]](s32)
+    ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[COPY8]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]]
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[AND2]](s32)
+    ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16)
+    ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AND1]](s32)
+    ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]]
+    ; GFX9: [[COPY11:%[0-9]+]]:_(s16) = COPY [[SHL]](s16)
+    ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[AND3]](s32)
+    ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[COPY11]], [[TRUNC2]](s16)
+    ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[AND]](s32)
+    ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]]
+    ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
+    ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[AND4]](s32)
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[AND5]], [[TRUNC4]](s16)
+    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SHL1]](s16)
+    ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16)
+    ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[ANYEXT]], [[ANYEXT1]]
+    ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
+    ; GFX9: $vgpr0 = COPY [[COPY13]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s32) = COPY $vgpr2
@@ -261,32 +570,158 @@ body: |
     ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-    ; SI: [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[COPY]](s32)
-    ; SI: [[TRUNC1:%[0-9]+]]:_(s24) = G_TRUNC [[COPY1]](s32)
-    ; SI: [[TRUNC2:%[0-9]+]]:_(s24) = G_TRUNC [[COPY2]](s32)
-    ; SI: [[FSHR:%[0-9]+]]:_(s24) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s24)
-    ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s24)
-    ; SI: $vgpr0 = COPY [[ANYEXT]](s32)
+    ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 23
+    ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]]
+    ; SI: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; SI: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; SI: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000
+    ; SI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C3]]
+    ; SI: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; SI: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C4]], [[AND1]]
+    ; SI: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SUB]], [[FPTOUI]]
+    ; SI: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[MUL]]
+    ; SI: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH]]
+    ; SI: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[AND]], [[ADD]]
+    ; SI: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH1]], [[AND1]]
+    ; SI: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; SI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB1]](s32), [[AND1]]
+    ; SI: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[AND1]]
+    ; SI: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB2]], [[SUB1]]
+    ; SI: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT]](s32), [[AND1]]
+    ; SI: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[AND1]]
+    ; SI: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]]
+    ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+    ; SI: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[COPY5]], [[COPY6]]
+    ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]]
+    ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[AND2]](s32)
+    ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[SUB4]](s32)
+    ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C2]]
+    ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[SHL]](s32)
+    ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[AND3]](s32)
+    ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+    ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C2]]
+    ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C2]]
+    ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[AND4]](s32)
+    ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[SHL1]](s32)
+    ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY13]], [[COPY14]]
+    ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
+    ; SI: $vgpr0 = COPY [[COPY15]](s32)
     ; VI-LABEL: name: test_fshr_s24_s24
     ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-    ; VI: [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[COPY]](s32)
-    ; VI: [[TRUNC1:%[0-9]+]]:_(s24) = G_TRUNC [[COPY1]](s32)
-    ; VI: [[TRUNC2:%[0-9]+]]:_(s24) = G_TRUNC [[COPY2]](s32)
-    ; VI: [[FSHR:%[0-9]+]]:_(s24) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s24)
-    ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s24)
-    ; VI: $vgpr0 = COPY [[ANYEXT]](s32)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 23
+    ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]]
+    ; VI: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; VI: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; VI: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000
+    ; VI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C3]]
+    ; VI: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; VI: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C4]], [[AND1]]
+    ; VI: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SUB]], [[FPTOUI]]
+    ; VI: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[MUL]]
+    ; VI: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH]]
+    ; VI: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[AND]], [[ADD]]
+    ; VI: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH1]], [[AND1]]
+    ; VI: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; VI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB1]](s32), [[AND1]]
+    ; VI: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[AND1]]
+    ; VI: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB2]], [[SUB1]]
+    ; VI: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT]](s32), [[AND1]]
+    ; VI: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[AND1]]
+    ; VI: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]]
+    ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+    ; VI: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[COPY5]], [[COPY6]]
+    ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]]
+    ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[AND2]](s32)
+    ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[SUB4]](s32)
+    ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C2]]
+    ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[SHL]](s32)
+    ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[AND3]](s32)
+    ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+    ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C2]]
+    ; VI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C2]]
+    ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[AND4]](s32)
+    ; VI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[SHL1]](s32)
+    ; VI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY13]], [[COPY14]]
+    ; VI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
+    ; VI: $vgpr0 = COPY [[COPY15]](s32)
     ; GFX9-LABEL: name: test_fshr_s24_s24
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-    ; GFX9: [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[COPY]](s32)
-    ; GFX9: [[TRUNC1:%[0-9]+]]:_(s24) = G_TRUNC [[COPY1]](s32)
-    ; GFX9: [[TRUNC2:%[0-9]+]]:_(s24) = G_TRUNC [[COPY2]](s32)
-    ; GFX9: [[FSHR:%[0-9]+]]:_(s24) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s24)
-    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s24)
-    ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 23
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+    ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
+    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]]
+    ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000
+    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C3]]
+    ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C4]], [[AND1]]
+    ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SUB]], [[FPTOUI]]
+    ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[MUL]]
+    ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH]]
+    ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[AND]], [[ADD]]
+    ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH1]], [[AND1]]
+    ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB1]](s32), [[AND1]]
+    ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[AND1]]
+    ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB2]], [[SUB1]]
+    ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT]](s32), [[AND1]]
+    ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[AND1]]
+    ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]]
+    ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+    ; GFX9: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[COPY5]], [[COPY6]]
+    ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]]
+    ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[AND2]](s32)
+    ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[SUB4]](s32)
+    ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C2]]
+    ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[SHL]](s32)
+    ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[AND3]](s32)
+    ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+    ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C2]]
+    ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C2]]
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[AND4]](s32)
+    ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[SHL1]](s32)
+    ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY13]], [[COPY14]]
+    ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
+    ; GFX9: $vgpr0 = COPY [[COPY15]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s32) = COPY $vgpr2
@@ -297,3 +732,929 @@ body: |
     %7:_(s32) = G_ANYEXT %6
     $vgpr0 = COPY %7
 ...
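
For the non-power-of-two s24 case the amount cannot simply be masked, so it is reduced with an unsigned remainder instead; the long G_UITOFP/G_AMDGPU_RCP_IFLAG/G_FPTOUI prologue in those checks, with its two compare-and-subtract fixups, is AMDGPU's expansion of that G_UREM by 24. A minimal sketch, assuming 24-bit lanes carried in uint32_t (the % 24 below stands in for the expanded remainder):

    #include <cstdint>

    // Sketch of the lowered s24 G_FSHR from the checks above.
    static uint32_t fshr24(uint32_t x, uint32_t y, uint32_t z) {
      const uint32_t Mask = 0xFFFFFF;                 // G_AND with 16777215
      uint32_t amt = (z & Mask) % 24;                 // G_UREM by the bit width
      uint32_t hi = ((x << 1) << (23u - amt)) & Mask; // i.e. x << (24 - amt)
      uint32_t lo = (y & Mask) >> amt;
      return hi | lo;
    }
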
+
+---
+name: test_fshr_v3s16_v3s16
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+    ; SI-LABEL: name: test_fshr_v3s16_v3s16
+    ; SI: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; SI: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; SI: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+    ; SI: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+    ; SI: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+    ; SI: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+    ; SI: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+    ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>)
+    ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+    ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>)
+    ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+    ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>)
+    ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32)
+    ; SI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>)
+    ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32)
+    ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32)
+    ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]]
+    ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32)
+    ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]]
+    ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; SI: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+    ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32)
+    ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]]
+    ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY9]], [[C]](s32)
+    ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
+    ; SI: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
+    ; SI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; SI: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; SI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[C3]], [[C4]]
+    ; SI: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; SI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C3]], [[C5]]
+    ; SI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C4]]
+    ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND3]](s16)
+    ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
+    ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[ZEXT]](s32)
+    ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32)
+    ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+    ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]]
+    ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[C6]](s32)
+    ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16)
+    ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32)
+    ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]]
+    ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[ZEXT1]](s32)
+    ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
+    ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[TRUNC]], [[TRUNC1]]
+    ; SI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[C3]], [[C4]]
+    ; SI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C3]], [[C5]]
+    ; SI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C4]]
+    ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND7]](s16)
+    ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[ZEXT2]](s32)
+    ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32)
+    ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C6]](s32)
+    ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C1]]
+    ; SI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[AND9]], [[COPY14]](s32)
+    ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16)
+    ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32)
+    ; SI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C1]]
+    ; SI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[ZEXT3]](s32)
+    ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32)
+    ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[TRUNC2]], [[TRUNC3]]
+    ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[C6]](s32)
+    ; SI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+    ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[COPY17]](s32)
+    ; SI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[C6]](s32)
+    ; SI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[COPY20]], [[COPY19]](s32)
+    ; SI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; SI: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[COPY22]], [[C]](s32)
+    ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[COPY21]], [[SHL6]]
+    ; SI: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32)
+    ; SI: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BITCAST6]], [[BITCAST8]]
+    ; SI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>)
+    ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST9]](s32)
+    ; SI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32)
+    ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR10]](s32)
+    ; SI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C4]]
+    ; SI: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC4]], [[C5]]
+    ; SI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C4]]
+    ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[AND11]](s16)
+    ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR2]](s16)
+    ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT]], [[ZEXT4]](s32)
+    ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32)
+    ; SI: [[COPY23:%[0-9]+]]:_(s32) = COPY [[C6]](s32)
+    ; SI: [[COPY24:%[0-9]+]]:_(s32) = COPY [[SHL4]](s32)
+    ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C1]]
+    ; SI: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[AND13]], [[COPY23]](s32)
+    ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[AND12]](s16)
+    ; SI: [[COPY25:%[0-9]+]]:_(s32) = COPY [[LSHR11]](s32)
+    ; SI: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY25]], [[C1]]
+    ; SI: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[AND14]], [[ZEXT5]](s32)
+    ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR12]](s32)
+    ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[TRUNC6]], [[TRUNC7]]
+    ; SI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C4]]
+    ; SI: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC5]], [[C5]]
+    ; SI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C4]]
+    ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[AND15]](s16)
+    ; SI: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[OR3]](s16)
+    ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT1]], [[ZEXT6]](s32)
+    ; SI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32)
+    ; SI: [[COPY26:%[0-9]+]]:_(s32) = COPY [[C6]](s32)
+    ; SI: [[COPY27:%[0-9]+]]:_(s32) = COPY [[SHL5]](s32)
+    ; SI: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY27]], [[C1]]
+    ; SI: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[AND17]], [[COPY26]](s32)
+    ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[AND16]](s16)
+    ; SI: [[COPY28:%[0-9]+]]:_(s32) = COPY [[LSHR13]](s32)
+    ; SI: [[AND18:%[0-9]+]]:_(s32) = G_AND [[COPY28]], [[C1]]
+    ; SI: [[LSHR14:%[0-9]+]]:_(s32) = G_LSHR [[AND18]], [[ZEXT7]](s32)
+    ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR14]](s32)
+    ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[TRUNC8]], [[TRUNC9]]
+    ; SI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16)
+    ; SI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16)
+    ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C]](s32)
+    ; SI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL9]]
+    ; SI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR7]](s32)
+    ; SI: [[AND19:%[0-9]+]]:_(s16) = G_AND [[C3]], [[C4]]
+    ; SI: [[XOR5:%[0-9]+]]:_(s16) = G_XOR [[C3]], [[C5]]
+    ; SI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[XOR5]], [[C4]]
+    ; SI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[AND19]](s16)
+    ; SI: [[COPY29:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
+    ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[COPY29]], [[ZEXT10]](s32)
+    ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32)
+    ; SI: [[COPY30:%[0-9]+]]:_(s32) = COPY [[C6]](s32)
+    ; SI: [[COPY31:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32)
+    ; SI: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY31]], [[C1]]
+    ; SI: [[LSHR15:%[0-9]+]]:_(s32) = G_LSHR [[AND21]], [[COPY30]](s32)
+    ; SI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[AND20]](s16)
+    ; SI: [[COPY32:%[0-9]+]]:_(s32) = COPY [[LSHR15]](s32)
+    ; SI: [[AND22:%[0-9]+]]:_(s32) = G_AND [[COPY32]], [[C1]]
+    ; SI: [[LSHR16:%[0-9]+]]:_(s32) = G_LSHR [[AND22]], [[ZEXT11]](s32)
+    ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR16]](s32)
+    ; SI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[TRUNC10]], [[TRUNC11]]
+    ; SI: [[AND23:%[0-9]+]]:_(s16) = G_AND [[C3]], [[C4]]
+    ; SI: [[XOR6:%[0-9]+]]:_(s16) = G_XOR [[C3]], [[C5]]
+    ; SI: [[AND24:%[0-9]+]]:_(s16) = G_AND [[XOR6]], [[C4]]
+    ; SI: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[AND23]](s16)
+    ; SI: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[DEF1]], [[ZEXT12]](s32)
+    ; SI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32)
+    ; SI: [[COPY33:%[0-9]+]]:_(s32) = COPY [[C6]](s32)
+    ; SI: [[LSHR17:%[0-9]+]]:_(s32) = G_LSHR [[C2]], [[COPY33]](s32)
+    ; SI: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[AND24]](s16)
+    ; SI: [[COPY34:%[0-9]+]]:_(s32) = COPY [[LSHR17]](s32)
+    ; SI: [[AND25:%[0-9]+]]:_(s32) = G_AND [[COPY34]], [[C1]]
+    ; SI: [[LSHR18:%[0-9]+]]:_(s32) = G_LSHR [[AND25]], [[ZEXT13]](s32)
+    ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR18]](s32)
+    ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[TRUNC12]], [[TRUNC13]]
+    ; SI: [[COPY35:%[0-9]+]]:_(s32) = COPY [[C6]](s32)
+    ; SI: [[COPY36:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32)
+    ; SI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[COPY36]], [[COPY35]](s32)
+    ; SI: [[COPY37:%[0-9]+]]:_(s32) = COPY [[C6]](s32)
+    ; SI: [[COPY38:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+    ; SI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[COPY38]], [[COPY37]](s32)
+    ; SI: [[COPY39:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; SI: [[COPY40:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; SI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[COPY40]], [[C]](s32)
+    ; SI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[COPY39]], [[SHL14]]
+    ; SI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR10]](s32)
+    ; SI: [[XOR7:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BITCAST7]], [[BITCAST11]]
+    ; SI: [[BITCAST12:%[0-9]+]]:_(s32) = G_BITCAST [[XOR7]](<2 x s16>)
+    ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST12]](s32)
+    ; SI: [[LSHR19:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST12]], [[C]](s32)
+    ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR19]](s32)
+    ; SI: [[AND26:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C4]]
+    ; SI: [[XOR8:%[0-9]+]]:_(s16) = G_XOR [[TRUNC14]], [[C5]]
+    ; SI: [[AND27:%[0-9]+]]:_(s16) = G_AND [[XOR8]], [[C4]]
+    ; SI: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[AND26]](s16)
+    ; SI: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[OR8]](s16)
+    ; SI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT2]], [[ZEXT14]](s32)
+    ; SI: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32)
+    ; SI: [[COPY41:%[0-9]+]]:_(s32) = COPY [[C6]](s32)
+    ; SI: [[COPY42:%[0-9]+]]:_(s32) = COPY [[SHL12]](s32)
+    ; SI: [[AND28:%[0-9]+]]:_(s32) = G_AND [[COPY42]], [[C1]]
+    ; SI: [[LSHR20:%[0-9]+]]:_(s32) = G_LSHR [[AND28]], [[COPY41]](s32)
+    ; SI: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[AND27]](s16)
+    ; SI: [[COPY43:%[0-9]+]]:_(s32) = COPY [[LSHR20]](s32)
+    ; SI: [[AND29:%[0-9]+]]:_(s32) = G_AND [[COPY43]], [[C1]]
+    ; SI: [[LSHR21:%[0-9]+]]:_(s32) = G_LSHR [[AND29]], [[ZEXT15]](s32)
+    ; SI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR21]](s32)
+    ; SI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[TRUNC16]], [[TRUNC17]]
+    ; SI: [[AND30:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C4]]
+    ; SI: [[XOR9:%[0-9]+]]:_(s16) = G_XOR [[TRUNC15]], [[C5]]
+    ; SI: [[AND31:%[0-9]+]]:_(s16) = G_AND [[XOR9]], [[C4]]
+    ; SI: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[AND30]](s16)
+    ; SI: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[OR9]](s16)
+    ; SI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT3]], [[ZEXT16]](s32)
+    ; SI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[SHL16]](s32)
+    ; SI: [[COPY44:%[0-9]+]]:_(s32) = COPY [[C6]](s32)
+    ; SI: [[COPY45:%[0-9]+]]:_(s32) = COPY [[SHL13]](s32)
+    ; SI: [[AND32:%[0-9]+]]:_(s32) = G_AND [[COPY45]], [[C1]]
+    ; SI: [[LSHR22:%[0-9]+]]:_(s32) = G_LSHR [[AND32]], [[COPY44]](s32)
+    ; SI: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[AND31]](s16)
+    ; SI: [[COPY46:%[0-9]+]]:_(s32) = COPY [[LSHR22]](s32)
+    ; SI: [[AND33:%[0-9]+]]:_(s32) = G_AND [[COPY46]], [[C1]]
+    ; SI: [[LSHR23:%[0-9]+]]:_(s32) = G_LSHR [[AND33]], [[ZEXT17]](s32)
+    ; SI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR23]](s32)
+    ; SI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[TRUNC18]], [[TRUNC19]]
+    ; SI: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[OR11]](s16)
+    ; SI: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16)
+    ; SI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT19]], [[C]](s32)
+    ; SI: [[OR13:%[0-9]+]]:_(s32) = G_OR [[ZEXT18]], [[SHL17]]
+    ; SI: [[BITCAST13:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR13]](s32)
+    ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; SI: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; SI: [[BITCAST14:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST10]](<2 x s16>)
+    ; SI: [[LSHR24:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST14]], [[C]](s32)
+    ; SI: [[BITCAST15:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST13]](<2 x s16>)
+    ; SI: [[LSHR25:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST15]], [[C]](s32)
+    ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>)
+    ; SI: [[BITCAST16:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; SI: [[LSHR26:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST16]], [[C]](s32)
+    ; SI: [[BITCAST17:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; SI: [[LSHR27:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST17]], [[C]](s32)
+    ; SI: [[COPY47:%[0-9]+]]:_(s32) = COPY [[BITCAST14]](s32)
+    ; SI: [[AND34:%[0-9]+]]:_(s32) = G_AND [[COPY47]], [[C1]]
+    ; SI: [[COPY48:%[0-9]+]]:_(s32) = COPY [[LSHR24]](s32)
+    ; SI: [[AND35:%[0-9]+]]:_(s32) = G_AND [[COPY48]], [[C1]]
+    ; SI: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[AND35]], [[C]](s32)
+    ; SI: [[OR14:%[0-9]+]]:_(s32) = G_OR [[AND34]], [[SHL18]]
+    ; SI: [[BITCAST18:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR14]](s32)
+    ; SI: [[COPY49:%[0-9]+]]:_(s32) = COPY [[BITCAST15]](s32)
+    ; SI: [[AND36:%[0-9]+]]:_(s32) = G_AND [[COPY49]], [[C1]]
+    ; SI: [[COPY50:%[0-9]+]]:_(s32) = COPY [[BITCAST16]](s32)
+    ; SI: [[AND37:%[0-9]+]]:_(s32) = G_AND [[COPY50]], [[C1]]
+    ; SI: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[AND37]], [[C]](s32)
+    ; SI: [[OR15:%[0-9]+]]:_(s32) = G_OR [[AND36]], [[SHL19]]
+    ; SI: [[BITCAST19:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR15]](s32)
+    ; SI: [[COPY51:%[0-9]+]]:_(s32) = COPY [[LSHR26]](s32)
+    ; SI: [[AND38:%[0-9]+]]:_(s32) = G_AND [[COPY51]], [[C1]]
+    ; SI: [[COPY52:%[0-9]+]]:_(s32) = COPY [[BITCAST17]](s32)
+    ; SI: [[AND39:%[0-9]+]]:_(s32) = G_AND [[COPY52]], [[C1]]
+    ; SI: [[SHL20:%[0-9]+]]:_(s32) = G_SHL [[AND39]], [[C]](s32)
+    ; SI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[AND38]], [[SHL20]]
+    ; SI: [[BITCAST20:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR16]](s32)
+    ; SI: $vgpr0 = COPY [[BITCAST18]](<2 x s16>)
+    ; SI: $vgpr1 = COPY [[BITCAST19]](<2 x s16>)
+    ; SI: $vgpr2 = COPY [[BITCAST20]](<2 x s16>)
+    ; VI-LABEL: name: test_fshr_v3s16_v3s16
+    ; VI: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; VI: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; VI: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+    ; VI: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+    ; VI: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+    ; VI: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+    ; VI: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+    ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; VI: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>)
+    ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+    ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+    ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+    ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>)
+    ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
+    ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+    ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>)
+    ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32)
+    ; VI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>)
+    ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32)
+    ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32)
+    ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]]
+    ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32)
+    ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]]
+    ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; VI: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+    ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32)
+    ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]]
+    ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32)
+    ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
+    ; VI: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
+    ; VI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; VI: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[C3]], [[C4]]
+    ; VI: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; VI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C3]], [[C5]]
+    ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C4]]
+    ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND3]](s16)
+    ; VI: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C3]](s16)
+    ; VI: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[LSHR6]], [[AND4]](s16)
+    ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[SHL2]], [[LSHR7]]
+    ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[C3]], [[C4]]
+    ; VI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C3]], [[C5]]
+    ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C4]]
+    ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND5]](s16)
+    ; VI: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC4]], [[C3]](s16)
+    ; VI: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[LSHR8]], [[AND6]](s16)
+    ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[SHL3]], [[LSHR9]]
+    ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C3]](s16)
+    ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[TRUNC4]], [[C3]](s16)
+    ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; VI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32)
+    ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[COPY9]], [[SHL6]]
+    ; VI: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32)
+    ; VI: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BITCAST6]], [[BITCAST8]]
+    ; VI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>)
+    ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST9]](s32)
+    ; VI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32)
+    ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR10]](s32)
+    ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C4]]
+    ; VI: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C5]]
+    ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C4]]
+    ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[OR2]], [[AND7]](s16)
+    ; VI: [[LSHR11:%[0-9]+]]:_(s16) = G_LSHR [[SHL4]], [[C3]](s16)
+    ; VI: [[LSHR12:%[0-9]+]]:_(s16) = G_LSHR [[LSHR11]], [[AND8]](s16)
+    ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[SHL7]], [[LSHR12]]
+    ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C4]]
+    ; VI: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C5]]
+    ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C4]]
+    ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[OR3]], [[AND9]](s16)
+    ; VI: [[LSHR13:%[0-9]+]]:_(s16) = G_LSHR [[SHL5]], [[C3]](s16)
+    ; VI: [[LSHR14:%[0-9]+]]:_(s16) = G_LSHR [[LSHR13]], [[AND10]](s16)
+    ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[SHL8]], [[LSHR14]]
+    ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16)
+    ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16)
+    ; VI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
+    ; VI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL9]]
+    ; VI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR7]](s32)
+    ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[C3]], [[C4]]
+    ; VI: [[XOR5:%[0-9]+]]:_(s16) = G_XOR [[C3]], [[C5]]
+    ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[XOR5]], [[C4]]
+    ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[AND11]](s16)
+    ; VI: [[LSHR15:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC5]], [[C3]](s16)
+    ; VI: [[LSHR16:%[0-9]+]]:_(s16) = G_LSHR [[LSHR15]], [[AND12]](s16)
+    ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[SHL10]], [[LSHR16]]
+    ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[C3]], [[C4]]
+    ; VI: [[XOR6:%[0-9]+]]:_(s16) = G_XOR [[C3]], [[C5]]
+    ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[XOR6]], [[C4]]
+    ; VI: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[DEF1]], [[AND13]](s16)
+    ; VI: [[LSHR17:%[0-9]+]]:_(s16) = G_LSHR [[DEF1]], [[C3]](s16)
+    ; VI: [[LSHR18:%[0-9]+]]:_(s16) = G_LSHR [[LSHR17]], [[AND14]](s16)
+    ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[SHL11]], [[LSHR18]]
+    ; VI: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[TRUNC5]], [[C3]](s16)
+    ; VI: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[DEF1]], [[C3]](s16)
+    ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; VI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[COPY11]], [[C]](s32)
+    ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[COPY10]], [[SHL14]]
+    ; VI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR10]](s32)
+    ; VI: [[XOR7:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BITCAST7]], [[BITCAST11]]
+    ; VI: [[BITCAST12:%[0-9]+]]:_(s32) = G_BITCAST [[XOR7]](<2 x s16>)
+    ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST12]](s32)
+    ; VI: [[LSHR19:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST12]], [[C]](s32)
+    ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR19]](s32)
+    ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C4]]
+    ; VI: [[XOR8:%[0-9]+]]:_(s16) = G_XOR [[TRUNC8]], [[C5]]
+    ; VI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[XOR8]], [[C4]]
+    ; VI: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[OR8]], [[AND15]](s16)
+    ; VI: [[LSHR20:%[0-9]+]]:_(s16) = G_LSHR [[SHL12]], [[C3]](s16)
+    ; VI: [[LSHR21:%[0-9]+]]:_(s16) = G_LSHR [[LSHR20]], [[AND16]](s16)
+    ; VI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[SHL15]], [[LSHR21]]
+    ; VI: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C4]]
+    ; VI: [[XOR9:%[0-9]+]]:_(s16) = G_XOR [[TRUNC9]], [[C5]]
+    ; VI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[XOR9]], [[C4]]
+    ; VI: [[SHL16:%[0-9]+]]:_(s16) = G_SHL [[OR9]], [[AND17]](s16)
+    ; VI: [[LSHR22:%[0-9]+]]:_(s16) = G_LSHR [[SHL13]], [[C3]](s16)
+    ; VI: [[LSHR23:%[0-9]+]]:_(s16) = G_LSHR [[LSHR22]], [[AND18]](s16)
+    ; VI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[SHL16]], [[LSHR23]]
+    ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR11]](s16)
+    ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16)
+    ; VI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32)
+    ; VI: [[OR13:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL17]]
+    ; VI: [[BITCAST13:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR13]](s32)
+    ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; VI: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; VI: [[BITCAST14:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST10]](<2 x s16>)
+    ; VI: [[LSHR24:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST14]], [[C]](s32)
+    ; VI: [[BITCAST15:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST13]](<2 x s16>)
+    ; VI: [[LSHR25:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST15]], [[C]](s32)
+    ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>)
+    ; VI: [[BITCAST16:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; VI: [[LSHR26:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST16]], [[C]](s32)
+    ; VI: [[BITCAST17:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; VI: [[LSHR27:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST17]], [[C]](s32)
+    ; VI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST14]](s32)
+    ; VI: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]]
+    ; VI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR24]](s32)
+    ; VI: [[AND20:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]]
+    ; VI: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[AND20]], [[C]](s32)
+    ; VI: [[OR14:%[0-9]+]]:_(s32) = G_OR [[AND19]], [[SHL18]]
+    ; VI: [[BITCAST18:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR14]](s32)
+    ; VI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST15]](s32)
+    ; VI: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C1]]
+    ; VI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[BITCAST16]](s32)
+    ; VI: [[AND22:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C1]]
+    ; VI: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[AND22]], [[C]](s32)
+    ; VI: [[OR15:%[0-9]+]]:_(s32) = G_OR [[AND21]], [[SHL19]]
+    ; VI: [[BITCAST19:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR15]](s32)
+    ; VI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR26]](s32)
+    ; VI: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C1]]
+    ; VI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[BITCAST17]](s32)
+    ; VI: [[AND24:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C1]]
+    ; VI: [[SHL20:%[0-9]+]]:_(s32) = G_SHL [[AND24]], [[C]](s32)
+    ; VI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[AND23]], [[SHL20]]
+    ; VI: [[BITCAST20:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR16]](s32)
+    ; VI: $vgpr0 = COPY [[BITCAST18]](<2 x s16>)
+    ; VI: $vgpr1 = COPY [[BITCAST19]](<2 x s16>)
+    ; VI: $vgpr2 = COPY [[BITCAST20]](<2 x s16>)
+    ; GFX9-LABEL: name: test_fshr_v3s16_v3s16
+    ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX9: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+    ; GFX9: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+    ; GFX9: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+    ; GFX9: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+    ; GFX9: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+    ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
+    ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32)
+    ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
+    ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[COPY9]](s32)
+    ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>)
+    ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+    ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>)
+    ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+    ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+    ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32)
+    ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
+    ; GFX9: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>)
+    ; GFX9: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32)
+    ; GFX9: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>)
+    ; GFX9: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32)
+    ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32)
+    ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
+    ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF1]](s32)
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[C1]](s32)
+    ; GFX9: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[BUILD_VECTOR_TRUNC4]], [[BUILD_VECTOR_TRUNC6]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[C2]](s32)
+    ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR_TRUNC4]], [[BUILD_VECTOR_TRUNC7]]
+    ; GFX9: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR_TRUNC6]]
+    ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[C3]](s32)
+    ; GFX9: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC8]](<2 x s16>)
+    ; GFX9: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[SHL]], [[AND1]](<2 x s16>)
+    ; GFX9: [[LSHR6:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR_TRUNC2]], [[AND]](<2 x s16>)
+    ; GFX9: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL1]], [[LSHR6]]
+    ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[COPY21:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32)
+    ; GFX9: [[AND2:%[0-9]+]]:_(<2 x s16>) = G_AND [[BUILD_VECTOR_TRUNC5]], [[BUILD_VECTOR_TRUNC9]]
+    ; GFX9: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; GFX9: [[COPY23:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC10:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY22]](s32), [[COPY23]](s32)
+    ; GFX9: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR_TRUNC5]], [[BUILD_VECTOR_TRUNC10]]
+    ; GFX9: [[AND3:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR1]], [[BUILD_VECTOR_TRUNC9]]
+    ; GFX9: [[COPY24:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+    ; GFX9: [[COPY25:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC11:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY24]](s32), [[COPY25]](s32)
+    ; GFX9: [[SHL2:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC11]](<2 x s16>)
+    ; GFX9: [[SHL3:%[0-9]+]]:_(<2 x s16>) = G_SHL [[SHL2]], [[AND3]](<2 x s16>)
+    ; GFX9: [[LSHR7:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR_TRUNC3]], [[AND2]](<2 x s16>)
+    ; GFX9: [[OR1:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL3]], [[LSHR7]]
+    ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX9: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX9: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[OR]](<2 x s16>)
+    ; GFX9: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32)
+    ; GFX9: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[OR1]](<2 x s16>)
+    ; GFX9: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32)
+    ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>)
+    ; GFX9: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; GFX9: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32)
+    ; GFX9: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; GFX9: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32)
+    ; GFX9: [[COPY26:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32)
+    ; GFX9: [[COPY27:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC12:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY26]](s32), [[COPY27]](s32)
+    ; GFX9: [[COPY28:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32)
+    ; GFX9: [[COPY29:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC13:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY28]](s32), [[COPY29]](s32)
+    ; GFX9: [[COPY30:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32)
+    ; GFX9: [[COPY31:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC14:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY30]](s32), [[COPY31]](s32)
+    ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC12]](<2 x s16>)
+    ; GFX9: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC13]](<2 x s16>)
+    ; GFX9: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC14]](<2 x s16>)
+    %0:_(<2 x s16>) = COPY $vgpr0
+    %1:_(<2 x s16>) = COPY $vgpr1
+    %2:_(<2 x s16>) = COPY $vgpr2
+    %3:_(<2 x s16>) = COPY $vgpr3
+    %4:_(<2 x s16>) = COPY $vgpr4
+    %5:_(<2 x s16>) = COPY $vgpr5
+    %6:_(<2 x s16>) = G_IMPLICIT_DEF
+    %7:_(<6 x s16>) = G_CONCAT_VECTORS %0(<2 x s16>), %1(<2 x s16>), %6(<2 x s16>)
+    %8:_(<3 x s16>), %9:_(<3 x s16>) = G_UNMERGE_VALUES %7(<6 x s16>)
+    %10:_(<6 x s16>) = G_CONCAT_VECTORS %2(<2 x s16>), %3(<2 x s16>), %6(<2 x s16>)
+    %11:_(<3 x s16>), %12:_(<3 x s16>) = G_UNMERGE_VALUES %10(<6 x s16>)
+    %13:_(<6 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>), %6(<2 x s16>)
+    %14:_(<3 x s16>), %15:_(<3 x s16>) = G_UNMERGE_VALUES %13(<6 x s16>)
+    %16:_(<3 x s16>) = G_FSHR %8, %11, %14(<3 x s16>)
+    %17:_(<3 x s16>) = G_IMPLICIT_DEF
+    %18:_(<6 x s16>) = G_CONCAT_VECTORS %16(<3 x s16>), %17(<3 x s16>)
+    %19:_(<2 x s16>), %20:_(<2 x s16>), %21:_(<2 x s16>) = G_UNMERGE_VALUES %18(<6 x s16>)
+    $vgpr0 = COPY %19(<2 x s16>)
+    $vgpr1 = COPY %20(<2 x s16>)
+    $vgpr2 = COPY %21(<2 x s16>)
+...
+
+---
+name: test_fshr_v4s16_v4s16
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
+
+    ; SI-LABEL: name: test_fshr_v4s16_v4s16
+    ; SI: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    ; SI: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
+    ; SI: [[COPY2:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr4_vgpr5
+    ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+    ; SI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
+    ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>)
+    ; SI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+    ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+    ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+    ; SI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
+    ; SI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; SI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
+    ; SI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]]
+    ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16)
+    ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
+    ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[ZEXT]](s32)
+    ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32)
+    ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
+    ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]]
+    ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C4]](s32)
+    ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16)
+    ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]]
+    ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[ZEXT1]](s32)
+    ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
+    ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC]], [[TRUNC1]]
+    ; SI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
+    ; SI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
+    ; SI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]]
+    ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16)
+    ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[ZEXT2]](s32)
+    ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32)
+    ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]]
+    ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[COPY7]](s32)
+    ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16)
+    ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32)
+    ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C5]]
+    ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[ZEXT3]](s32)
+    ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32)
+    ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[TRUNC2]], [[TRUNC3]]
+    ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+    ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32)
+    ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+    ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY11]], [[COPY10]](s32)
+    ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32)
+    ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[COPY12]](s32)
+    ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C1]](s32)
+    ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY14]], [[SHL4]]
+    ; SI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
+    ; SI: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV4]], [[BITCAST3]]
+    ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>)
+    ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
+    ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32)
+    ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
+    ; SI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C2]]
+    ; SI: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC4]], [[C3]]
+    ; SI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]]
+    ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16)
+    ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16)
+    ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT]], [[ZEXT4]](s32)
+    ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32)
+    ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[SHL2]](s32)
+    ; SI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C5]]
+    ; SI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[COPY16]](s32)
+    ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[AND9]](s16)
+    ; SI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32)
+    ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C5]]
+    ; SI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[AND11]], [[ZEXT5]](s32)
+    ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32)
+    ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[TRUNC6]], [[TRUNC7]]
+    ; SI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C2]]
+    ; SI: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC5]], [[C3]]
+    ; SI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]]
+    ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[AND12]](s16)
+    ; SI: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[OR1]](s16)
+    ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT1]], [[ZEXT6]](s32)
+    ; SI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32)
+    ; SI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; SI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[SHL3]](s32)
+    ; SI: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C5]]
+    ; SI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[AND14]], [[COPY19]](s32)
+    ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[AND13]](s16)
+    ; SI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32)
+    ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C5]]
+    ; SI: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[AND15]], [[ZEXT7]](s32)
+    ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32)
+    ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[TRUNC8]], [[TRUNC9]]
+    ; SI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+    ; SI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
+    ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C1]](s32)
+    ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL7]]
+    ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
+    ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; SI: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C1]](s32)
+    ; SI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+    ; SI: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C1]](s32)
+    ; SI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
+    ; SI: [[XOR5:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
+    ; SI: [[AND17:%[0-9]+]]:_(s16) = G_AND [[XOR5]], [[C2]]
+    ; SI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[AND16]](s16)
+    ; SI: [[COPY22:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32)
+    ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[COPY22]], [[ZEXT10]](s32)
+    ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32)
+    ; SI: [[COPY23:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; SI: [[COPY24:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32)
+    ; SI: [[AND18:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C5]]
+    ; SI: [[LSHR14:%[0-9]+]]:_(s32) = G_LSHR [[AND18]], [[COPY23]](s32)
+    ; SI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[AND17]](s16)
+    ; SI: [[COPY25:%[0-9]+]]:_(s32) = COPY [[LSHR14]](s32)
+    ; SI: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY25]], [[C5]]
+    ; SI: [[LSHR15:%[0-9]+]]:_(s32) = G_LSHR [[AND19]], [[ZEXT11]](s32)
+    ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR15]](s32)
+    ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[TRUNC10]], [[TRUNC11]]
+    ; SI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
+    ; SI: [[XOR6:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
+    ; SI: [[AND21:%[0-9]+]]:_(s16) = G_AND [[XOR6]], [[C2]]
+    ; SI: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[AND20]](s16)
+    ; SI: [[COPY26:%[0-9]+]]:_(s32) = COPY [[LSHR12]](s32)
+    ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[COPY26]], [[ZEXT12]](s32)
+    ; SI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32)
+    ; SI: [[COPY27:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; SI: [[COPY28:%[0-9]+]]:_(s32) = COPY [[LSHR13]](s32)
+    ; SI: [[AND22:%[0-9]+]]:_(s32) = G_AND [[COPY28]], [[C5]]
+    ; SI: [[LSHR16:%[0-9]+]]:_(s32) = G_LSHR [[AND22]], [[COPY27]](s32)
+    ; SI: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[AND21]](s16)
+    ; SI: [[COPY29:%[0-9]+]]:_(s32) = COPY [[LSHR16]](s32)
+    ; SI: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY29]], [[C5]]
+    ; SI: [[LSHR17:%[0-9]+]]:_(s32) = G_LSHR [[AND23]], [[ZEXT13]](s32)
+    ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR17]](s32)
+    ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[TRUNC12]], [[TRUNC13]]
+    ; SI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+    ; SI: [[LSHR18:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C1]](s32)
+    ; SI: [[COPY30:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; SI: [[COPY31:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32)
+    ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[COPY31]], [[COPY30]](s32)
+    ; SI: [[COPY32:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; SI: [[COPY33:%[0-9]+]]:_(s32) = COPY [[LSHR18]](s32)
+    ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[COPY33]], [[COPY32]](s32)
+    ; SI: [[COPY34:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; SI: [[COPY35:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; SI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[COPY35]], [[C1]](s32)
+    ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[COPY34]], [[SHL12]]
+    ; SI: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR8]](s32)
+    ; SI: [[XOR7:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV5]], [[BITCAST9]]
+    ; SI: [[BITCAST10:%[0-9]+]]:_(s32) = G_BITCAST [[XOR7]](<2 x s16>)
+    ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST10]](s32)
+    ; SI: [[LSHR19:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST10]], [[C1]](s32)
+    ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR19]](s32)
+    ; SI: [[AND24:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C2]]
+    ; SI: [[XOR8:%[0-9]+]]:_(s16) = G_XOR [[TRUNC14]], [[C3]]
+    ; SI: [[AND25:%[0-9]+]]:_(s16) = G_AND [[XOR8]], [[C2]]
+    ; SI: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[AND24]](s16)
+    ; SI: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[OR6]](s16)
+    ; SI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT2]], [[ZEXT14]](s32)
+    ; SI: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32)
+    ; SI: [[COPY36:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; SI: [[COPY37:%[0-9]+]]:_(s32) = COPY [[SHL10]](s32)
+    ; SI: [[AND26:%[0-9]+]]:_(s32) = G_AND [[COPY37]], [[C5]]
+    ; SI: [[LSHR20:%[0-9]+]]:_(s32) = G_LSHR [[AND26]], [[COPY36]](s32)
+    ; SI: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[AND25]](s16)
+    ; SI: [[COPY38:%[0-9]+]]:_(s32) = COPY [[LSHR20]](s32)
+    ; SI: [[AND27:%[0-9]+]]:_(s32) = G_AND [[COPY38]], [[C5]]
+    ; SI: [[LSHR21:%[0-9]+]]:_(s32) = G_LSHR [[AND27]], [[ZEXT15]](s32)
+    ; SI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR21]](s32)
+    ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[TRUNC16]], [[TRUNC17]]
+    ; SI: [[AND28:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C2]]
+    ; SI: [[XOR9:%[0-9]+]]:_(s16) = G_XOR [[TRUNC15]], [[C3]]
+    ; SI: [[AND29:%[0-9]+]]:_(s16) = G_AND [[XOR9]], [[C2]]
+    ; SI: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[AND28]](s16)
+    ; SI: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[OR7]](s16)
+    ; SI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT3]], [[ZEXT16]](s32)
+    ; SI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32)
+    ; SI: [[COPY39:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; SI: [[COPY40:%[0-9]+]]:_(s32) = COPY [[SHL11]](s32)
+    ; SI: [[AND30:%[0-9]+]]:_(s32) = G_AND [[COPY40]], [[C5]]
+    ; SI: [[LSHR22:%[0-9]+]]:_(s32) = G_LSHR [[AND30]], [[COPY39]](s32)
+    ; SI: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[AND29]](s16)
+    ; SI: [[COPY41:%[0-9]+]]:_(s32) = COPY [[LSHR22]](s32)
+    ; SI: [[AND31:%[0-9]+]]:_(s32) = G_AND [[COPY41]], [[C5]]
+    ; SI: [[LSHR23:%[0-9]+]]:_(s32) = G_LSHR [[AND31]], [[ZEXT17]](s32)
+    ; SI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR23]](s32)
+    ; SI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[TRUNC18]], [[TRUNC19]]
+    ; SI: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16)
+    ; SI: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16)
+    ; SI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT19]], [[C1]](s32)
+    ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT18]], [[SHL15]]
+    ; SI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR11]](s32)
+    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST5]](<2 x s16>), [[BITCAST11]](<2 x s16>)
+    ; SI: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; VI-LABEL: name: test_fshr_v4s16_v4s16
+    ; VI: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    ; VI: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
+    ; VI: [[COPY2:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr4_vgpr5
+    ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+    ; VI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
+    ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>)
+    ; VI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+    ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+    ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+    ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+    ; VI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
+    ; VI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; VI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
+    ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]]
+    ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16)
+    ; VI: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C]](s16)
+    ; VI: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[LSHR2]], [[AND1]](s16)
+    ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL]], [[LSHR3]]
+    ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
+    ; VI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
+    ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]]
+    ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND2]](s16)
+    ; VI: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C]](s16)
+    ; VI: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[LSHR4]], [[AND3]](s16)
+    ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[SHL1]], [[LSHR5]]
+    ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+    ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+    ; VI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32)
+    ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32)
+    ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC4]], [[C]](s16)
+    ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC5]], [[C]](s16)
+    ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[C4]], [[C1]](s32)
+    ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY3]], [[SHL4]]
+    ; VI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
+    ; VI: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV4]], [[BITCAST3]]
+    ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>)
+    ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
+    ; VI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32)
+    ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
+    ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C2]]
+    ; VI: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C3]]
+    ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]]
+    ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[OR]], [[AND4]](s16)
+    ; VI: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[SHL2]], [[C]](s16)
+    ; VI: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[LSHR8]], [[AND5]](s16)
+    ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[SHL5]], [[LSHR9]]
+    ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C2]]
+    ; VI: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C3]]
+    ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]]
+    ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[OR1]], [[AND6]](s16)
+    ; VI: [[LSHR10:%[0-9]+]]:_(s16) = G_LSHR [[SHL3]], [[C]](s16)
+    ; VI: [[LSHR11:%[0-9]+]]:_(s16) = G_LSHR [[LSHR10]], [[AND7]](s16)
+    ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[SHL6]], [[LSHR11]]
+    ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+    ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
+    ; VI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
+    ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL7]]
+    ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
+    ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32)
+    ; VI: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C1]](s32)
+    ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR12]](s32)
+    ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+    ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST7]](s32)
+    ; VI: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C1]](s32)
+    ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR13]](s32)
+    ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
+    ; VI: [[XOR5:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
+    ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR5]], [[C2]]
+    ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[TRUNC8]], [[AND8]](s16)
+    ; VI: [[LSHR14:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC10]], [[C]](s16)
+    ; VI: [[LSHR15:%[0-9]+]]:_(s16) = G_LSHR [[LSHR14]], [[AND9]](s16)
+    ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[SHL8]], [[LSHR15]]
+    ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
+    ; VI: [[XOR6:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
+    ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[XOR6]], [[C2]]
+    ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[TRUNC9]], [[AND10]](s16)
+    ; VI: [[LSHR16:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC11]], [[C]](s16)
+    ; VI: [[LSHR17:%[0-9]+]]:_(s16) = G_LSHR [[LSHR16]], [[AND11]](s16)
+    ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[SHL9]], [[LSHR17]]
+    ; VI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+    ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST8]](s32)
+    ; VI: [[LSHR18:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C1]](s32)
+    ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR18]](s32)
+    ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[TRUNC12]], [[C]](s16)
+    ; VI: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[TRUNC13]], [[C]](s16)
+    ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+    ; VI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C1]](s32)
+    ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[COPY4]], [[SHL12]]
+    ; VI: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR8]](s32)
+    ; VI: [[XOR7:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV5]], [[BITCAST9]]
+    ; VI: [[BITCAST10:%[0-9]+]]:_(s32) = G_BITCAST [[XOR7]](<2 x s16>)
+    ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST10]](s32)
+    ; VI: [[LSHR19:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST10]], [[C1]](s32)
+    ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR19]](s32)
+    ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C2]]
+    ; VI: [[XOR8:%[0-9]+]]:_(s16) = G_XOR [[TRUNC14]], [[C3]]
+    ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[XOR8]], [[C2]]
+    ; VI: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[OR6]], [[AND12]](s16)
+    ; VI: [[LSHR20:%[0-9]+]]:_(s16) = G_LSHR [[SHL10]], [[C]](s16)
+    ; VI: [[LSHR21:%[0-9]+]]:_(s16) = G_LSHR [[LSHR20]], [[AND13]](s16)
+    ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[SHL13]], [[LSHR21]]
+    ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C2]]
+    ; VI: [[XOR9:%[0-9]+]]:_(s16) = G_XOR [[TRUNC15]], [[C3]]
+    ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[XOR9]], [[C2]]
+    ; VI: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[OR7]], [[AND14]](s16)
+    ; VI: [[LSHR22:%[0-9]+]]:_(s16) = G_LSHR [[SHL11]], [[C]](s16)
+    ; VI: [[LSHR23:%[0-9]+]]:_(s16) = G_LSHR [[LSHR22]], [[AND15]](s16)
+    ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[SHL14]], [[LSHR23]]
+    ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16)
+    ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16)
+    ; VI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C1]](s32)
+    ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL15]]
+    ; VI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR11]](s32)
+    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST5]](<2 x s16>), [[BITCAST11]](<2 x s16>)
+    ; VI: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; GFX9-LABEL: name: test_fshr_v4s16_v4s16
+    ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
+    ; GFX9: [[COPY2:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr4_vgpr5
+    ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+    ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
+    ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[C]](s32)
+    ; GFX9: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[UV4]], [[BUILD_VECTOR_TRUNC]]
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[C1]](s32)
+    ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV4]], [[BUILD_VECTOR_TRUNC1]]
+    ; GFX9: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR_TRUNC]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[C2]](s32)
+    ; GFX9: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+    ; GFX9: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[SHL]], [[AND1]](<2 x s16>)
+    ; GFX9: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[UV2]], [[AND]](<2 x s16>)
+    ; GFX9: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL1]], [[LSHR]]
+    ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32)
+    ; GFX9: [[AND2:%[0-9]+]]:_(<2 x s16>) = G_AND [[UV5]], [[BUILD_VECTOR_TRUNC3]]
+    ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[COPY9]](s32)
+    ; GFX9: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV5]], [[BUILD_VECTOR_TRUNC4]]
+    ; GFX9: [[AND3:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR1]], [[BUILD_VECTOR_TRUNC3]]
+    ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX9: [[SHL2:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV1]], [[BUILD_VECTOR_TRUNC5]](<2 x s16>)
+    ; GFX9: [[SHL3:%[0-9]+]]:_(<2 x s16>) = G_SHL [[SHL2]], [[AND3]](<2 x s16>)
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[UV3]], [[AND2]](<2 x s16>)
+    ; GFX9: [[OR1:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL3]], [[LSHR1]]
+    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[OR]](<2 x s16>), [[OR1]](<2 x s16>)
+    ; GFX9: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    %1:_(<4 x s16>) = COPY $vgpr2_vgpr3
+    %2:_(<4 x s16>) = COPY $vgpr4_vgpr5
+    %3:_(<4 x s16>) = G_FSHR %0, %1, %2
+    $vgpr0_vgpr1 = COPY %3
+...
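The GFX9 checks above read back the shifts-based expansion that lowerFunnelShiftAsShifts produces for a power-of-two element width: fshr(x, y, z) = ((x << 1) << (~z & (BW - 1))) | (y >> (z & (BW - 1))). Below is a minimal standalone C++ sketch of that identity for 16-bit elements, matching the G_AND/G_XOR/G_SHL/G_LSHR/G_OR sequence in the checks; it is illustrative only, not code from the patch, and the helper names are made up:

#include <cassert>
#include <cstdint>

// Reference semantics of llvm.fshr.i16: concatenate x:y into a 32-bit
// value, shift it right by z modulo 16, and keep the low 16 bits.
static uint16_t fshr_ref(uint16_t x, uint16_t y, uint16_t z) {
  unsigned amt = z % 16;
  if (amt == 0)
    return y;
  return (uint16_t)((x << (16 - amt)) | (y >> amt));
}

// The shifts-based lowering visible in the checks: pre-shift x left by
// one so the inverted amount (~z & 15) never has to reach 16.
static uint16_t fshr_lowered(uint16_t x, uint16_t y, uint16_t z) {
  uint16_t amt = z & 15;                               // G_AND with splat 15
  uint16_t inv = (uint16_t)(~z) & 15;                  // G_XOR with -1, G_AND 15
  uint16_t hi = (uint16_t)((uint16_t)(x << 1) << inv); // two G_SHLs
  uint16_t lo = (uint16_t)(y >> amt);                  // G_LSHR
  return (uint16_t)(hi | lo);                          // G_OR
}

int main() {
  const uint16_t vals[] = {0, 1, 0x00ff, 0x0f0f, 0x8000, 0xabcd, 0xffff};
  for (uint16_t x : vals)
    for (uint16_t y : vals)
      for (unsigned z = 0; z < 64; ++z)
        assert(fshr_ref(x, y, (uint16_t)z) == fshr_lowered(x, y, (uint16_t)z));
}

When z & 15 is zero, the pre-shift makes the high half vanish ((x << 1) << 15 truncates to zero in 16 bits), so the OR correctly yields y, which is why the expansion needs no explicit zero-amount check.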
