[llvm] 64bef13 - [GlobalISel] Look through truncs and extends in narrowScalarShift
Konstantin Schwarz via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 10 04:50:11 PDT 2021
Author: Konstantin Schwarz
Date: 2021-08-10T13:49:22+02:00
New Revision: 64bef13f083b3d9808beb9456c5bf65434ff748d
URL: https://github.com/llvm/llvm-project/commit/64bef13f083b3d9808beb9456c5bf65434ff748d
DIFF: https://github.com/llvm/llvm-project/commit/64bef13f083b3d9808beb9456c5bf65434ff748d.diff
LOG: [GlobalISel] Look through truncs and extends in narrowScalarShift
If a G_SHL is fed by a G_CONSTANT, the lower and upper bits of the source can be
shifted individually by the constant shift amount.
However, if the shift amount came from a G_TRUNC(G_CONSTANT), the generic shift legalization
code was used instead, producing intermediate shifts that are potentially illegal on some targets.
This change teaches narrowScalarShift to look through G_TRUNCs and G_*EXTs.
Reviewed By: paquette
Differential Revision: https://reviews.llvm.org/D89100
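
For context, a minimal sketch of the kind of generic MIR this change targets (the register
names %val, %c, %amt and %shl are hypothetical, not taken from the tests below): the shift
amount is a constant, but it only reaches narrowScalarShift through a G_TRUNC.

    ; Hypothetical pre-legalization MIR: the s128 shift amount is a truncated constant,
    ; not a bare G_CONSTANT.
    %c:_(s64) = G_CONSTANT i64 16
    %amt:_(s32) = G_TRUNC %c(s64)
    %shl:_(s128) = G_SHL %val, %amt(s32)

With the look-through, such a shift is narrowed the same way as one fed directly by a
G_CONSTANT, which is why the updated CHECK lines below keep only constant G_SHL/G_LSHR/G_OR
operations on the 64-bit halves instead of the compare-and-select expansion used for
variable shift amounts.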
Added:
Modified:
llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
llvm/test/CodeGen/AArch64/GlobalISel/legalize-and.mir
llvm/test/CodeGen/AArch64/GlobalISel/legalize-bswap.mir
llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 08d440be01117..e12b3ed2b70ca 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4877,10 +4877,10 @@ LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
const LLT HalfTy = LLT::scalar(NewBitSize);
const LLT CondTy = LLT::scalar(1);
- if (const MachineInstr *KShiftAmt =
- getOpcodeDef(TargetOpcode::G_CONSTANT, Amt, MRI)) {
- return narrowScalarShiftByConstant(
- MI, KShiftAmt->getOperand(1).getCImm()->getValue(), HalfTy, ShiftAmtTy);
+ if (auto VRegAndVal =
+ getConstantVRegValWithLookThrough(Amt, MRI, true, false)) {
+ return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
+ ShiftAmtTy);
}
// TODO: Expand with known bits.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
index c0f09428c3799..4676592bc9718 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
@@ -373,112 +373,60 @@ define void @atomic_load_relaxed(i64, i64, i128* %p, i128* %p2) {
;
; CHECK-LLSC-O0-LABEL: atomic_load_relaxed:
; CHECK-LLSC-O0: // %bb.0:
-; CHECK-LLSC-O0-NEXT: sub sp, sp, #64
-; CHECK-LLSC-O0-NEXT: .cfi_def_cfa_offset 64
-; CHECK-LLSC-O0-NEXT: str x2, [sp, #48] // 8-byte Folded Spill
-; CHECK-LLSC-O0-NEXT: str x3, [sp, #56] // 8-byte Folded Spill
+; CHECK-LLSC-O0-NEXT: sub sp, sp, #48
+; CHECK-LLSC-O0-NEXT: .cfi_def_cfa_offset 48
+; CHECK-LLSC-O0-NEXT: str x2, [sp, #32] // 8-byte Folded Spill
+; CHECK-LLSC-O0-NEXT: str x3, [sp, #40] // 8-byte Folded Spill
; CHECK-LLSC-O0-NEXT: b .LBB4_1
; CHECK-LLSC-O0-NEXT: .LBB4_1: // %atomicrmw.start
; CHECK-LLSC-O0-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-LLSC-O0-NEXT: ldr x11, [sp, #48] // 8-byte Folded Reload
-; CHECK-LLSC-O0-NEXT: ldxp x9, x15, [x11]
-; CHECK-LLSC-O0-NEXT: mov x12, xzr
-; CHECK-LLSC-O0-NEXT: mov w8, #64
-; CHECK-LLSC-O0-NEXT: // kill: def $x8 killed $w8
-; CHECK-LLSC-O0-NEXT: mov w10, #64
-; CHECK-LLSC-O0-NEXT: // kill: def $x10 killed $w10
-; CHECK-LLSC-O0-NEXT: str x10, [sp, #8] // 8-byte Folded Spill
-; CHECK-LLSC-O0-NEXT: subs x16, x10, #64
-; CHECK-LLSC-O0-NEXT: subs x13, x8, #64
-; CHECK-LLSC-O0-NEXT: lsl x14, x15, x10
-; CHECK-LLSC-O0-NEXT: lsr x13, x15, x13
-; CHECK-LLSC-O0-NEXT: orr x13, x13, x12
-; CHECK-LLSC-O0-NEXT: lsl x15, x15, x16
-; CHECK-LLSC-O0-NEXT: subs x16, x10, #64
-; CHECK-LLSC-O0-NEXT: csel x14, x14, x12, lo
-; CHECK-LLSC-O0-NEXT: subs x16, x10, #64
-; CHECK-LLSC-O0-NEXT: csel x13, x13, x15, lo
-; CHECK-LLSC-O0-NEXT: subs x15, x10, #0
-; CHECK-LLSC-O0-NEXT: csel x13, x12, x13, eq
-; CHECK-LLSC-O0-NEXT: orr x9, x9, x14
-; CHECK-LLSC-O0-NEXT: orr x12, x12, x13
+; CHECK-LLSC-O0-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
+; CHECK-LLSC-O0-NEXT: ldxp x9, x10, [x11]
+; CHECK-LLSC-O0-NEXT: mov x8, xzr
+; CHECK-LLSC-O0-NEXT: orr x9, x9, x8
+; CHECK-LLSC-O0-NEXT: orr x10, x8, x10
; CHECK-LLSC-O0-NEXT: // implicit-def: $q0
; CHECK-LLSC-O0-NEXT: mov v0.d[0], x9
+; CHECK-LLSC-O0-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-LLSC-O0-NEXT: mov v0.d[1], x10
; CHECK-LLSC-O0-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-LLSC-O0-NEXT: mov v0.d[1], x12
-; CHECK-LLSC-O0-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-LLSC-O0-NEXT: subs x13, x10, #64
-; CHECK-LLSC-O0-NEXT: subs x8, x8, #64
-; CHECK-LLSC-O0-NEXT: lsl x8, x12, x8
-; CHECK-LLSC-O0-NEXT: orr x8, x8, x9, lsr #0
-; CHECK-LLSC-O0-NEXT: lsr x12, x12, x13
-; CHECK-LLSC-O0-NEXT: subs x13, x10, #64
-; CHECK-LLSC-O0-NEXT: csel x8, x8, x12, lo
-; CHECK-LLSC-O0-NEXT: subs x10, x10, #0
-; CHECK-LLSC-O0-NEXT: csel x10, x9, x8, eq
; CHECK-LLSC-O0-NEXT: stxp w8, x9, x10, [x11]
; CHECK-LLSC-O0-NEXT: cbnz w8, .LBB4_1
; CHECK-LLSC-O0-NEXT: b .LBB4_2
; CHECK-LLSC-O0-NEXT: .LBB4_2: // %atomicrmw.end
-; CHECK-LLSC-O0-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-LLSC-O0-NEXT: ldr x8, [sp, #56] // 8-byte Folded Reload
+; CHECK-LLSC-O0-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-LLSC-O0-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload
; CHECK-LLSC-O0-NEXT: str q0, [x8]
-; CHECK-LLSC-O0-NEXT: add sp, sp, #64
+; CHECK-LLSC-O0-NEXT: add sp, sp, #48
; CHECK-LLSC-O0-NEXT: ret
;
; CHECK-CAS-O0-LABEL: atomic_load_relaxed:
; CHECK-CAS-O0: // %bb.0:
-; CHECK-CAS-O0-NEXT: sub sp, sp, #64
-; CHECK-CAS-O0-NEXT: .cfi_def_cfa_offset 64
-; CHECK-CAS-O0-NEXT: str x2, [sp, #48] // 8-byte Folded Spill
-; CHECK-CAS-O0-NEXT: str x3, [sp, #56] // 8-byte Folded Spill
+; CHECK-CAS-O0-NEXT: sub sp, sp, #48
+; CHECK-CAS-O0-NEXT: .cfi_def_cfa_offset 48
+; CHECK-CAS-O0-NEXT: str x2, [sp, #32] // 8-byte Folded Spill
+; CHECK-CAS-O0-NEXT: str x3, [sp, #40] // 8-byte Folded Spill
; CHECK-CAS-O0-NEXT: b .LBB4_1
; CHECK-CAS-O0-NEXT: .LBB4_1: // %atomicrmw.start
; CHECK-CAS-O0-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-CAS-O0-NEXT: ldr x11, [sp, #48] // 8-byte Folded Reload
-; CHECK-CAS-O0-NEXT: ldxp x9, x15, [x11]
-; CHECK-CAS-O0-NEXT: mov x12, #0
-; CHECK-CAS-O0-NEXT: mov w8, #64
-; CHECK-CAS-O0-NEXT: // kill: def $x8 killed $w8
-; CHECK-CAS-O0-NEXT: mov w10, #64
-; CHECK-CAS-O0-NEXT: // kill: def $x10 killed $w10
-; CHECK-CAS-O0-NEXT: str x10, [sp, #8] // 8-byte Folded Spill
-; CHECK-CAS-O0-NEXT: subs x16, x10, #64
-; CHECK-CAS-O0-NEXT: subs x13, x8, #64
-; CHECK-CAS-O0-NEXT: lsl x14, x15, x10
-; CHECK-CAS-O0-NEXT: lsr x13, x15, x13
-; CHECK-CAS-O0-NEXT: orr x13, x13, x12
-; CHECK-CAS-O0-NEXT: lsl x15, x15, x16
-; CHECK-CAS-O0-NEXT: subs x16, x10, #64
-; CHECK-CAS-O0-NEXT: csel x14, x14, x12, lo
-; CHECK-CAS-O0-NEXT: subs x16, x10, #64
-; CHECK-CAS-O0-NEXT: csel x13, x13, x15, lo
-; CHECK-CAS-O0-NEXT: subs x15, x10, #0
-; CHECK-CAS-O0-NEXT: csel x13, x12, x13, eq
-; CHECK-CAS-O0-NEXT: orr x9, x9, x14
-; CHECK-CAS-O0-NEXT: orr x12, x12, x13
+; CHECK-CAS-O0-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
+; CHECK-CAS-O0-NEXT: ldxp x9, x10, [x11]
+; CHECK-CAS-O0-NEXT: mov x8, #0
+; CHECK-CAS-O0-NEXT: orr x9, x9, x8
+; CHECK-CAS-O0-NEXT: orr x10, x8, x10
; CHECK-CAS-O0-NEXT: // implicit-def: $q0
; CHECK-CAS-O0-NEXT: mov v0.d[0], x9
+; CHECK-CAS-O0-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-CAS-O0-NEXT: mov v0.d[1], x10
; CHECK-CAS-O0-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-CAS-O0-NEXT: mov v0.d[1], x12
-; CHECK-CAS-O0-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-CAS-O0-NEXT: subs x13, x10, #64
-; CHECK-CAS-O0-NEXT: subs x8, x8, #64
-; CHECK-CAS-O0-NEXT: lsl x8, x12, x8
-; CHECK-CAS-O0-NEXT: orr x8, x8, x9, lsr #0
-; CHECK-CAS-O0-NEXT: lsr x12, x12, x13
-; CHECK-CAS-O0-NEXT: subs x13, x10, #64
-; CHECK-CAS-O0-NEXT: csel x8, x8, x12, lo
-; CHECK-CAS-O0-NEXT: subs x10, x10, #0
-; CHECK-CAS-O0-NEXT: csel x10, x9, x8, eq
; CHECK-CAS-O0-NEXT: stxp w8, x9, x10, [x11]
; CHECK-CAS-O0-NEXT: cbnz w8, .LBB4_1
; CHECK-CAS-O0-NEXT: b .LBB4_2
; CHECK-CAS-O0-NEXT: .LBB4_2: // %atomicrmw.end
-; CHECK-CAS-O0-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-CAS-O0-NEXT: ldr x8, [sp, #56] // 8-byte Folded Reload
+; CHECK-CAS-O0-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-CAS-O0-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload
; CHECK-CAS-O0-NEXT: str q0, [x8]
-; CHECK-CAS-O0-NEXT: add sp, sp, #64
+; CHECK-CAS-O0-NEXT: add sp, sp, #48
; CHECK-CAS-O0-NEXT: ret
%r = load atomic i128, i128* %p monotonic, align 16
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-and.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-and.mir
index 41bf07a8c8158..5196cc0b26e7f 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-and.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-and.mir
@@ -44,87 +44,38 @@ body: |
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD1]](s32), [[DEF]](s32)
; CHECK: [[DEF1:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
- ; CHECK: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
- ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C3]], [[C4]]
- ; CHECK: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C3]]
- ; CHECK: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[C3]](s64), [[C4]]
- ; CHECK: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[C3]](s64), [[C]]
- ; CHECK: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP1]](s32)
; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[MV]], [[C3]](s64)
- ; CHECK: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[MV]], [[SUB1]](s64)
; CHECK: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[DEF1]], [[C3]](s64)
- ; CHECK: [[OR:%[0-9]+]]:_(s64) = G_OR [[LSHR]], [[SHL1]]
- ; CHECK: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[MV]], [[SUB]](s64)
- ; CHECK: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC]](s1), [[SHL]], [[C]]
- ; CHECK: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC]](s1), [[OR]], [[SHL2]]
- ; CHECK: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC1]](s1), [[DEF1]], [[SELECT1]]
- ; CHECK: [[OR1:%[0-9]+]]:_(s64) = G_OR [[SELECT]], [[ZEXTLOAD]]
- ; CHECK: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SELECT2]], [[C]]
- ; CHECK: [[SUB2:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C4]]
- ; CHECK: [[SUB3:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C4]]
- ; CHECK: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[C4]](s64), [[C4]]
- ; CHECK: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP2]](s32)
- ; CHECK: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[C4]](s64), [[C]]
- ; CHECK: [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP3]](s32)
- ; CHECK: [[SHL3:%[0-9]+]]:_(s64) = G_SHL [[OR1]], [[C4]](s64)
- ; CHECK: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[OR1]], [[SUB3]](s64)
- ; CHECK: [[SHL4:%[0-9]+]]:_(s64) = G_SHL [[OR2]], [[C4]](s64)
- ; CHECK: [[OR3:%[0-9]+]]:_(s64) = G_OR [[LSHR1]], [[SHL4]]
- ; CHECK: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[OR1]], [[SUB2]](s64)
- ; CHECK: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC2]](s1), [[SHL3]], [[C]]
- ; CHECK: [[SELECT4:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC2]](s1), [[OR3]], [[SHL5]]
- ; CHECK: [[SELECT5:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC3]](s1), [[OR2]], [[SELECT4]]
- ; CHECK: [[OR4:%[0-9]+]]:_(s64) = G_OR [[SELECT3]], [[LOAD]]
- ; CHECK: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SELECT5]], [[C]]
+ ; CHECK: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 48
+ ; CHECK: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[MV]], [[C4]](s64)
+ ; CHECK: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL1]], [[LSHR]]
+ ; CHECK: [[OR1:%[0-9]+]]:_(s64) = G_OR [[SHL]], [[ZEXTLOAD]]
+ ; CHECK: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+ ; CHECK: [[OR2:%[0-9]+]]:_(s64) = G_OR [[C]], [[LOAD]]
+ ; CHECK: [[OR3:%[0-9]+]]:_(s64) = G_OR [[OR1]], [[C]]
; CHECK: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD %ptr(p0) :: (load (s64), align 16)
; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0)
; CHECK: [[ZEXTLOAD1:%[0-9]+]]:_(s64) = G_ZEXTLOAD [[COPY1]](p0) :: (load (s16) from unknown-address + 8, align 8)
; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C2]](s64)
; CHECK: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 10, align 2)
; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD3]](s32), [[DEF]](s32)
- ; CHECK: [[SUB4:%[0-9]+]]:_(s64) = G_SUB [[C3]], [[C4]]
- ; CHECK: [[SUB5:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C3]]
- ; CHECK: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[C3]](s64), [[C4]]
- ; CHECK: [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP4]](s32)
- ; CHECK: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[C3]](s64), [[C]]
- ; CHECK: [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP5]](s32)
- ; CHECK: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[MV1]], [[C3]](s64)
- ; CHECK: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[MV1]], [[SUB5]](s64)
- ; CHECK: [[SHL7:%[0-9]+]]:_(s64) = G_SHL [[DEF1]], [[C3]](s64)
- ; CHECK: [[OR6:%[0-9]+]]:_(s64) = G_OR [[LSHR2]], [[SHL7]]
- ; CHECK: [[SHL8:%[0-9]+]]:_(s64) = G_SHL [[MV1]], [[SUB4]](s64)
- ; CHECK: [[SELECT6:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC4]](s1), [[SHL6]], [[C]]
- ; CHECK: [[SELECT7:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC4]](s1), [[OR6]], [[SHL8]]
- ; CHECK: [[SELECT8:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC5]](s1), [[DEF1]], [[SELECT7]]
- ; CHECK: [[OR7:%[0-9]+]]:_(s64) = G_OR [[SELECT6]], [[ZEXTLOAD1]]
- ; CHECK: [[OR8:%[0-9]+]]:_(s64) = G_OR [[SELECT8]], [[C]]
- ; CHECK: [[SUB6:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C4]]
- ; CHECK: [[SUB7:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C4]]
- ; CHECK: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[C4]](s64), [[C4]]
- ; CHECK: [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP6]](s32)
- ; CHECK: [[ICMP7:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[C4]](s64), [[C]]
- ; CHECK: [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP7]](s32)
- ; CHECK: [[SHL9:%[0-9]+]]:_(s64) = G_SHL [[OR7]], [[C4]](s64)
- ; CHECK: [[LSHR3:%[0-9]+]]:_(s64) = G_LSHR [[OR7]], [[SUB7]](s64)
- ; CHECK: [[SHL10:%[0-9]+]]:_(s64) = G_SHL [[OR8]], [[C4]](s64)
- ; CHECK: [[OR9:%[0-9]+]]:_(s64) = G_OR [[LSHR3]], [[SHL10]]
- ; CHECK: [[SHL11:%[0-9]+]]:_(s64) = G_SHL [[OR7]], [[SUB6]](s64)
- ; CHECK: [[SELECT9:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC6]](s1), [[SHL9]], [[C]]
- ; CHECK: [[SELECT10:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC6]](s1), [[OR9]], [[SHL11]]
- ; CHECK: [[SELECT11:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC7]](s1), [[OR8]], [[SELECT10]]
- ; CHECK: [[OR10:%[0-9]+]]:_(s64) = G_OR [[SELECT9]], [[LOAD2]]
- ; CHECK: [[OR11:%[0-9]+]]:_(s64) = G_OR [[SELECT11]], [[C]]
- ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[OR4]], [[OR10]]
- ; CHECK: [[AND1:%[0-9]+]]:_(s64) = G_AND [[OR5]], [[OR11]]
+ ; CHECK: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[MV1]], [[C3]](s64)
+ ; CHECK: [[SHL3:%[0-9]+]]:_(s64) = G_SHL [[DEF1]], [[C3]](s64)
+ ; CHECK: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[MV1]], [[C4]](s64)
+ ; CHECK: [[OR4:%[0-9]+]]:_(s64) = G_OR [[SHL3]], [[LSHR1]]
+ ; CHECK: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXTLOAD1]]
+ ; CHECK: [[OR6:%[0-9]+]]:_(s64) = G_OR [[C]], [[LOAD2]]
+ ; CHECK: [[OR7:%[0-9]+]]:_(s64) = G_OR [[OR5]], [[C]]
+ ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[OR2]], [[OR6]]
+ ; CHECK: [[AND1:%[0-9]+]]:_(s64) = G_AND [[OR3]], [[OR7]]
; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY [[AND]](s64)
- ; CHECK: [[TRUNC8:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64)
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64)
; CHECK: G_STORE [[COPY2]](s64), %ptr(p0) :: (store (s64), align 16)
- ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[TRUNC8]](s32)
- ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C3]](s64)
+ ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
+ ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C3]](s64)
; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD]], [[C2]](s64)
; CHECK: G_STORE [[COPY3]](s32), [[PTR_ADD]](p0) :: (store (s16) into unknown-address + 8, align 8)
- ; CHECK: G_STORE [[LSHR4]](s32), [[PTR_ADD3]](p0) :: (store (s8) into unknown-address + 10, align 2)
+ ; CHECK: G_STORE [[LSHR2]](s32), [[PTR_ADD3]](p0) :: (store (s8) into unknown-address + 10, align 2)
%ptr:_(p0) = COPY $x0
%a:_(s88) = G_LOAD %ptr(p0) :: (load (s88))
%b:_(s88) = G_LOAD %ptr(p0) :: (load (s88))
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bswap.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bswap.mir
index fd2f013542c38..e15ad817e921d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bswap.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bswap.mir
@@ -151,22 +151,11 @@ body: |
; CHECK: [[BSWAP1:%[0-9]+]]:_(s64) = G_BSWAP [[DEF]]
; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
- ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C]], [[C2]]
- ; CHECK: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[C2]], [[C]]
- ; CHECK: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[C]](s64), [[C2]]
- ; CHECK: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[C]](s64), [[C1]]
- ; CHECK: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP1]](s32)
- ; CHECK: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[BSWAP1]], [[C]](s64)
- ; CHECK: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[BSWAP]], [[C]](s64)
- ; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[BSWAP1]], [[SUB1]](s64)
- ; CHECK: [[OR:%[0-9]+]]:_(s64) = G_OR [[LSHR1]], [[SHL]]
- ; CHECK: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[BSWAP1]], [[SUB]](s64)
- ; CHECK: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC]](s1), [[OR]], [[LSHR2]]
- ; CHECK: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC1]](s1), [[BSWAP]], [[SELECT]]
- ; CHECK: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC]](s1), [[LSHR]], [[C1]]
- ; CHECK: $x0 = COPY [[SELECT1]](s64)
+ ; CHECK: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[BSWAP]], [[C]](s64)
+ ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
+ ; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[BSWAP1]], [[C2]](s64)
+ ; CHECK: [[OR:%[0-9]+]]:_(s64) = G_OR [[LSHR]], [[SHL]]
+ ; CHECK: $x0 = COPY [[OR]](s64)
; CHECK: RET_ReallyLR implicit $x0
%val:_(s88) = G_IMPLICIT_DEF
%bswap:_(s88) = G_BSWAP %val
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
index 9b37f664e53d6..c1653d121e0ef 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
@@ -609,48 +609,24 @@ body: |
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD1]](s32), [[DEF]](s32)
; CHECK: [[DEF1:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
- ; CHECK: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
- ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C3]], [[C4]]
- ; CHECK: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C3]]
- ; CHECK: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[C3]](s64), [[C4]]
- ; CHECK: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[C3]](s64), [[C]]
- ; CHECK: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP1]](s32)
; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[MV]], [[C3]](s64)
- ; CHECK: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[MV]], [[SUB1]](s64)
; CHECK: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[DEF1]], [[C3]](s64)
- ; CHECK: [[OR:%[0-9]+]]:_(s64) = G_OR [[LSHR]], [[SHL1]]
- ; CHECK: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[MV]], [[SUB]](s64)
- ; CHECK: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC]](s1), [[SHL]], [[C]]
- ; CHECK: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC]](s1), [[OR]], [[SHL2]]
- ; CHECK: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC1]](s1), [[DEF1]], [[SELECT1]]
- ; CHECK: [[OR1:%[0-9]+]]:_(s64) = G_OR [[SELECT]], [[ZEXTLOAD]]
- ; CHECK: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SELECT2]], [[C]]
- ; CHECK: [[SUB2:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C4]]
- ; CHECK: [[SUB3:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C4]]
- ; CHECK: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[C4]](s64), [[C4]]
- ; CHECK: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP2]](s32)
- ; CHECK: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[C4]](s64), [[C]]
- ; CHECK: [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP3]](s32)
- ; CHECK: [[SHL3:%[0-9]+]]:_(s64) = G_SHL [[OR1]], [[C4]](s64)
- ; CHECK: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[OR1]], [[SUB3]](s64)
- ; CHECK: [[SHL4:%[0-9]+]]:_(s64) = G_SHL [[OR2]], [[C4]](s64)
- ; CHECK: [[OR3:%[0-9]+]]:_(s64) = G_OR [[LSHR1]], [[SHL4]]
- ; CHECK: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[OR1]], [[SUB2]](s64)
- ; CHECK: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC2]](s1), [[SHL3]], [[C]]
- ; CHECK: [[SELECT4:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC2]](s1), [[OR3]], [[SHL5]]
- ; CHECK: [[SELECT5:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC3]](s1), [[OR2]], [[SELECT4]]
- ; CHECK: [[OR4:%[0-9]+]]:_(s64) = G_OR [[SELECT3]], [[LOAD]]
- ; CHECK: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SELECT5]], [[C]]
- ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY [[OR4]](s64)
- ; CHECK: [[TRUNC4:%[0-9]+]]:_(s32) = G_TRUNC [[OR5]](s64)
+ ; CHECK: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 48
+ ; CHECK: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[MV]], [[C4]](s64)
+ ; CHECK: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL1]], [[LSHR]]
+ ; CHECK: [[OR1:%[0-9]+]]:_(s64) = G_OR [[SHL]], [[ZEXTLOAD]]
+ ; CHECK: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+ ; CHECK: [[OR2:%[0-9]+]]:_(s64) = G_OR [[C]], [[LOAD]]
+ ; CHECK: [[OR3:%[0-9]+]]:_(s64) = G_OR [[OR1]], [[C]]
+ ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY [[OR2]](s64)
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[OR3]](s64)
; CHECK: G_STORE [[COPY]](s64), %ptr(p0) :: (store (s64), align 16)
; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
- ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[TRUNC4]](s32)
- ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C3]](s64)
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
+ ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C3]](s64)
; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD2]], [[C2]](s64)
; CHECK: G_STORE [[COPY1]](s32), [[PTR_ADD2]](p0) :: (store (s16) into unknown-address + 8, align 8)
- ; CHECK: G_STORE [[LSHR2]](s32), [[PTR_ADD3]](p0) :: (store (s8) into unknown-address + 10, align 2)
+ ; CHECK: G_STORE [[LSHR1]](s32), [[PTR_ADD3]](p0) :: (store (s8) into unknown-address + 10, align 2)
; CHECK: RET_ReallyLR
%ptr:_(p0) = COPY $x0
%load:_(s88) = G_LOAD %ptr(p0) :: (load (s88))
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 7ad60a5a7f7fc..f25d1939bfa9a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -4688,26 +4688,26 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX6-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1]
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], 1
-; GFX6-NEXT: s_lshl_b32 s7, s6, 31
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
-; GFX6-NEXT: s_mov_b32 s6, s11
-; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
+; GFX6-NEXT: s_lshl_b32 s5, s6, 31
+; GFX6-NEXT: s_mov_b32 s4, s11
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 1
; GFX6-NEXT: s_sub_i32 s12, s8, 64
; GFX6-NEXT: s_sub_i32 s10, 64, s8
; GFX6-NEXT: s_cmp_lt_u32 s8, 64
; GFX6-NEXT: s_cselect_b32 s13, 1, 0
; GFX6-NEXT: s_cmp_eq_u32 s8, 0
; GFX6-NEXT: s_cselect_b32 s16, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s8
-; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], s10
-; GFX6-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
+; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], s8
+; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10
+; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s8
; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s12
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
; GFX6-NEXT: s_cmp_lg_u32 s13, 0
-; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
; GFX6-NEXT: s_cmp_lg_u32 s13, 0
; GFX6-NEXT: s_cselect_b64 s[4:5], s[6:7], 0
; GFX6-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1]
@@ -4735,26 +4735,26 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX8-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1]
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], 1
-; GFX8-NEXT: s_lshl_b32 s7, s6, 31
-; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
-; GFX8-NEXT: s_mov_b32 s6, s11
-; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
+; GFX8-NEXT: s_lshl_b32 s5, s6, 31
+; GFX8-NEXT: s_mov_b32 s4, s11
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], 1
; GFX8-NEXT: s_sub_i32 s12, s8, 64
; GFX8-NEXT: s_sub_i32 s10, 64, s8
; GFX8-NEXT: s_cmp_lt_u32 s8, 64
; GFX8-NEXT: s_cselect_b32 s13, 1, 0
; GFX8-NEXT: s_cmp_eq_u32 s8, 0
; GFX8-NEXT: s_cselect_b32 s16, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s8
-; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], s10
-; GFX8-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
+; GFX8-NEXT: s_lshr_b64 s[6:7], s[4:5], s8
+; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10
+; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s8
; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s12
+; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
; GFX8-NEXT: s_cmp_lg_u32 s13, 0
-; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX8-NEXT: s_cmp_lg_u32 s16, 0
-; GFX8-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT: s_cmp_lg_u32 s13, 0
; GFX8-NEXT: s_cselect_b64 s[4:5], s[6:7], 0
; GFX8-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1]
@@ -4782,26 +4782,26 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX9-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1]
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], 1
-; GFX9-NEXT: s_lshl_b32 s7, s6, 31
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
-; GFX9-NEXT: s_mov_b32 s6, s11
-; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
+; GFX9-NEXT: s_lshl_b32 s5, s6, 31
+; GFX9-NEXT: s_mov_b32 s4, s11
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 1
; GFX9-NEXT: s_sub_i32 s12, s8, 64
; GFX9-NEXT: s_sub_i32 s10, 64, s8
; GFX9-NEXT: s_cmp_lt_u32 s8, 64
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
; GFX9-NEXT: s_cmp_eq_u32 s8, 0
; GFX9-NEXT: s_cselect_b32 s16, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s8
-; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s10
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
+; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], s8
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s8
; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s12
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
; GFX9-NEXT: s_cmp_lg_u32 s13, 0
-; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
; GFX9-NEXT: s_cmp_lg_u32 s13, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], s[6:7], 0
; GFX9-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1]
@@ -4832,21 +4832,21 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
; GFX10-NEXT: s_lshl_b32 s5, s6, 31
; GFX10-NEXT: s_mov_b32 s4, s11
-; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1
-; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX10-NEXT: s_sub_i32 s14, s8, 64
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10-NEXT: s_lshr_b64 s[4:5], s[6:7], 1
; GFX10-NEXT: s_sub_i32 s9, 64, s8
; GFX10-NEXT: s_cmp_lt_u32 s8, 64
; GFX10-NEXT: s_cselect_b32 s15, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s8, 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s8
-; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s9
-; GFX10-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
-; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11]
-; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
+; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s8
+; GFX10-NEXT: s_lshl_b64 s[10:11], s[4:5], s9
+; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
+; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s14
; GFX10-NEXT: s_cmp_lg_u32 s15, 0
-; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5]
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
; GFX10-NEXT: s_cmp_lg_u32 s15, 0
@@ -4882,24 +4882,24 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14
; GFX6-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc
-; GFX6-NEXT: v_lshr_b64 v[2:3], v[4:5], 1
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[6:7], 1
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 31, v6
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], 1
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v6
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[6:7], 1
; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v15
; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, 64, v15
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[2:3], v15
-; GFX6-NEXT: v_lshl_b64 v[6:7], v[0:1], v6
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v15
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], v14
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v15
+; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], v6
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v15
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v14
; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
; GFX6-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
; GFX6-NEXT: v_or_b32_e32 v0, v10, v0
@@ -4931,24 +4931,24 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14
; GFX8-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc
-; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[4:5]
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[6:7]
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 31, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5]
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v6
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7]
; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v15
; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, 64, v15
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v15, v[2:3]
-; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[0:1]
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], v15, v[0:1]
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v14, v[0:1]
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], v15, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3]
; GFX8-NEXT: v_or_b32_e32 v4, v4, v6
; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
; GFX8-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
; GFX8-NEXT: v_or_b32_e32 v0, v10, v0
@@ -4980,24 +4980,24 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14
; GFX9-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[4:5]
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[6:7]
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 31, v6
-; GFX9-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5]
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v6
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7]
; GFX9-NEXT: v_sub_u32_e32 v6, 64, v15
; GFX9-NEXT: v_subrev_u32_e32 v14, 64, v15
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], v15, v[2:3]
-; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[0:1]
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], v15, v[0:1]
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v14, v[0:1]
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], v15, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3]
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3]
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
; GFX9-NEXT: v_or_b32_e32 v0, v10, v0
@@ -5083,26 +5083,26 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
; GFX6-NEXT: v_mov_b32_e32 v3, s3
-; GFX6-NEXT: s_lshr_b64 s[2:3], s[4:5], 1
+; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
; GFX6-NEXT: s_lshl_b32 s9, s6, 31
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], 1
+; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], 1
; GFX6-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v7
-; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v7
-; GFX6-NEXT: v_lshl_b64 v[2:3], s[0:1], v2
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v7
+; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v2
; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v7
; GFX6-NEXT: v_or_b32_e32 v2, v0, v2
; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v11
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v11
; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX6-NEXT: v_lshr_b64 v[4:5], s[0:1], v7
+; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v7
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
+; GFX6-NEXT: v_mov_b32_e32 v2, s0
+; GFX6-NEXT: v_mov_b32_e32 v3, s1
; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
-; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
@@ -5136,26 +5136,26 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_lshr_b64 s[2:3], s[4:5], 1
+; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
; GFX8-NEXT: s_lshl_b32 s9, s6, 31
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], 1
+; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], 1
; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v7
-; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v7, s[2:3]
-; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1]
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v7
; GFX8-NEXT: v_or_b32_e32 v2, v0, v2
; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[0:1]
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3]
; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v7, s[0:1]
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
@@ -5188,27 +5188,27 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 1
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
; GFX9-NEXT: s_lshl_b32 s9, s6, 31
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], 1
-; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX9-NEXT: s_lshr_b64 s[2:3], s[6:7], 1
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX9-NEXT: v_sub_u32_e32 v2, 64, v7
; GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v7, s[2:3]
-; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1]
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v7
; GFX9-NEXT: v_or_b32_e32 v2, v0, v2
; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[0:1]
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3]
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], v7, s[0:1]
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
@@ -5295,33 +5295,33 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX6-NEXT: s_cmp_lg_u32 s13, 0
; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1
; GFX6-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1]
-; GFX6-NEXT: v_lshr_b64 v[3:4], v[2:3], 1
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 31, v2
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
; GFX6-NEXT: s_sub_i32 s3, 64, s4
; GFX6-NEXT: s_sub_i32 s2, s4, 64
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v4
; GFX6-NEXT: s_cmp_lt_u32 s4, 64
; GFX6-NEXT: s_cselect_b32 s5, 1, 0
; GFX6-NEXT: s_cmp_eq_u32 s4, 0
-; GFX6-NEXT: v_lshr_b64 v[5:6], v[0:1], s4
-; GFX6-NEXT: v_lshl_b64 v[7:8], v[3:4], s3
-; GFX6-NEXT: v_lshr_b64 v[9:10], v[3:4], s4
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s4
+; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s3
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s4
; GFX6-NEXT: s_cselect_b32 s8, 1, 0
-; GFX6-NEXT: v_lshr_b64 v[2:3], v[3:4], s2
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s2
; GFX6-NEXT: s_and_b32 s2, 1, s5
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX6-NEXT: v_or_b32_e32 v6, v6, v8
; GFX6-NEXT: s_and_b32 s2, 1, s8
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
; GFX6-NEXT: s_and_b32 s2, 1, s5
; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v9, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v10, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
; GFX6-NEXT: v_or_b32_e32 v0, s6, v0
; GFX6-NEXT: v_or_b32_e32 v1, s7, v1
; GFX6-NEXT: v_or_b32_e32 v2, s0, v2
@@ -5350,33 +5350,33 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX8-NEXT: s_cmp_lg_u32 s13, 0
; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
; GFX8-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1]
-; GFX8-NEXT: v_lshrrev_b64 v[3:4], 1, v[2:3]
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 31, v2
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
; GFX8-NEXT: s_sub_i32 s3, 64, s4
; GFX8-NEXT: s_sub_i32 s2, s4, 64
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
; GFX8-NEXT: s_cmp_lt_u32 s4, 64
; GFX8-NEXT: s_cselect_b32 s5, 1, 0
; GFX8-NEXT: s_cmp_eq_u32 s4, 0
-; GFX8-NEXT: v_lshrrev_b64 v[5:6], s4, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[7:8], s3, v[3:4]
-; GFX8-NEXT: v_lshrrev_b64 v[9:10], s4, v[3:4]
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[6:7], s3, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3]
; GFX8-NEXT: s_cselect_b32 s8, 1, 0
-; GFX8-NEXT: v_lshrrev_b64 v[2:3], s2, v[3:4]
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], s2, v[2:3]
; GFX8-NEXT: s_and_b32 s2, 1, s5
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v6
; GFX8-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX8-NEXT: v_or_b32_e32 v6, v6, v8
; GFX8-NEXT: s_and_b32 s2, 1, s8
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
; GFX8-NEXT: s_and_b32 s2, 1, s5
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
; GFX8-NEXT: v_or_b32_e32 v0, s6, v0
; GFX8-NEXT: v_or_b32_e32 v1, s7, v1
; GFX8-NEXT: v_or_b32_e32 v2, s0, v2
@@ -5405,33 +5405,33 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX9-NEXT: s_cmp_lg_u32 s13, 0
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
; GFX9-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-NEXT: v_lshrrev_b64 v[3:4], 1, v[2:3]
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 31, v2
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
; GFX9-NEXT: s_sub_i32 s3, 64, s4
; GFX9-NEXT: s_sub_i32 s2, s4, 64
-; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v4
; GFX9-NEXT: s_cmp_lt_u32 s4, 64
; GFX9-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
-; GFX9-NEXT: v_lshrrev_b64 v[5:6], s4, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[7:8], s3, v[3:4]
-; GFX9-NEXT: v_lshrrev_b64 v[9:10], s4, v[3:4]
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[6:7], s3, v[2:3]
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3]
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
-; GFX9-NEXT: v_lshrrev_b64 v[2:3], s2, v[3:4]
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], s2, v[2:3]
; GFX9-NEXT: s_and_b32 s2, 1, s5
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
+; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX9-NEXT: v_or_b32_e32 v6, v6, v8
; GFX9-NEXT: s_and_b32 s2, 1, s8
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
; GFX9-NEXT: s_and_b32 s2, 1, s5
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v10, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
; GFX9-NEXT: v_or_b32_e32 v0, s6, v0
; GFX9-NEXT: v_or_b32_e32 v1, s7, v1
; GFX9-NEXT: v_or_b32_e32 v2, s0, v2
@@ -5512,15 +5512,15 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], s8
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s5
; GFX6-NEXT: s_and_b32 s5, 1, s9
-; GFX6-NEXT: s_lshr_b64 s[8:9], s[2:3], 1
-; GFX6-NEXT: s_lshl_b32 s3, s2, 31
+; GFX6-NEXT: s_lshl_b32 s9, s2, 31
; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
-; GFX6-NEXT: s_mov_b32 s2, s7
+; GFX6-NEXT: s_mov_b32 s8, s7
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
; GFX6-NEXT: s_and_b32 s5, 1, s10
-; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_sub_i32 s10, s4, 64
-; GFX6-NEXT: s_sub_i32 s6, 64, s4
+; GFX6-NEXT: s_sub_i32 s8, 64, s4
; GFX6-NEXT: s_cmp_lt_u32 s4, 64
; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_or_b32_e32 v5, v5, v7
@@ -5532,19 +5532,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
-; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], s4
+; GFX6-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
+; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
; GFX6-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
-; GFX6-NEXT: s_lshl_b64 s[6:7], s[8:9], s6
-; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX6-NEXT: s_lshr_b64 s[6:7], s[8:9], s10
+; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX6-NEXT: s_cmp_lg_u32 s11, 0
-; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX6-NEXT: s_cmp_lg_u32 s11, 0
; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], 0
; GFX6-NEXT: v_or_b32_e32 v0, s0, v6
; GFX6-NEXT: v_or_b32_e32 v1, s1, v7
; GFX6-NEXT: v_or_b32_e32 v2, s2, v2
@@ -5567,15 +5567,15 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX8-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1]
; GFX8-NEXT: s_and_b32 s5, 1, s9
-; GFX8-NEXT: s_lshr_b64 s[8:9], s[2:3], 1
-; GFX8-NEXT: s_lshl_b32 s3, s2, 31
+; GFX8-NEXT: s_lshl_b32 s9, s2, 31
; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
-; GFX8-NEXT: s_mov_b32 s2, s7
+; GFX8-NEXT: s_mov_b32 s8, s7
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
; GFX8-NEXT: s_and_b32 s5, 1, s10
-; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_sub_i32 s10, s4, 64
-; GFX8-NEXT: s_sub_i32 s6, 64, s4
+; GFX8-NEXT: s_sub_i32 s8, 64, s4
; GFX8-NEXT: s_cmp_lt_u32 s4, 64
; GFX8-NEXT: v_or_b32_e32 v4, v4, v6
; GFX8-NEXT: v_or_b32_e32 v5, v5, v7
@@ -5587,19 +5587,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
-; GFX8-NEXT: s_lshr_b64 s[2:3], s[8:9], s4
+; GFX8-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
+; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
; GFX8-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
-; GFX8-NEXT: s_lshl_b64 s[6:7], s[8:9], s6
-; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX8-NEXT: s_lshr_b64 s[6:7], s[8:9], s10
+; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX8-NEXT: s_cmp_lg_u32 s11, 0
-; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GFX8-NEXT: s_cmp_lg_u32 s12, 0
-; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_cmp_lg_u32 s11, 0
; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], 0
; GFX8-NEXT: v_or_b32_e32 v0, s0, v6
; GFX8-NEXT: v_or_b32_e32 v1, s1, v7
; GFX8-NEXT: v_or_b32_e32 v2, s2, v2
@@ -5622,15 +5622,15 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX9-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1]
; GFX9-NEXT: s_and_b32 s5, 1, s9
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[2:3], 1
-; GFX9-NEXT: s_lshl_b32 s3, s2, 31
+; GFX9-NEXT: s_lshl_b32 s9, s2, 31
; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
-; GFX9-NEXT: s_mov_b32 s2, s7
+; GFX9-NEXT: s_mov_b32 s8, s7
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
; GFX9-NEXT: s_and_b32 s5, 1, s10
-; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_sub_i32 s10, s4, 64
-; GFX9-NEXT: s_sub_i32 s6, 64, s4
+; GFX9-NEXT: s_sub_i32 s8, 64, s4
; GFX9-NEXT: s_cmp_lt_u32 s4, 64
; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
@@ -5642,19 +5642,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
-; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s4
+; GFX9-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
+; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
; GFX9-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[8:9], s6
-; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[8:9], s10
+; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX9-NEXT: s_cmp_lg_u32 s11, 0
-; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GFX9-NEXT: s_cmp_lg_u32 s12, 0
-; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX9-NEXT: s_cmp_lg_u32 s11, 0
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], 0
; GFX9-NEXT: v_or_b32_e32 v0, s0, v6
; GFX9-NEXT: v_or_b32_e32 v1, s1, v7
; GFX9-NEXT: v_or_b32_e32 v2, s2, v2
@@ -5723,50 +5723,46 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
define amdgpu_ps i128 @s_fshl_i128_65(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-LABEL: s_fshl_i128_65:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_mov_b32 s9, 0
; GFX6-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
-; GFX6-NEXT: s_lshr_b32 s8, s7, 31
-; GFX6-NEXT: s_lshr_b32 s0, s5, 31
-; GFX6-NEXT: s_mov_b32 s1, s9
-; GFX6-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
-; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX6-NEXT: s_lshr_b32 s4, s5, 31
+; GFX6-NEXT: s_mov_b32 s5, 0
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
+; GFX6-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
+; GFX6-NEXT: s_lshr_b32 s4, s7, 31
+; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshl_i128_65:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_mov_b32 s9, 0
; GFX8-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
-; GFX8-NEXT: s_lshr_b32 s8, s7, 31
-; GFX8-NEXT: s_lshr_b32 s0, s5, 31
-; GFX8-NEXT: s_mov_b32 s1, s9
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
-; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX8-NEXT: s_lshr_b32 s4, s5, 31
+; GFX8-NEXT: s_mov_b32 s5, 0
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
+; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
+; GFX8-NEXT: s_lshr_b32 s4, s7, 31
+; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshl_i128_65:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s9, 0
; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
-; GFX9-NEXT: s_lshr_b32 s8, s7, 31
-; GFX9-NEXT: s_lshr_b32 s0, s5, 31
-; GFX9-NEXT: s_mov_b32 s1, s9
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
-; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX9-NEXT: s_lshr_b32 s4, s5, 31
+; GFX9-NEXT: s_mov_b32 s5, 0
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
+; GFX9-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
+; GFX9-NEXT: s_lshr_b32 s4, s7, 31
+; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshl_i128_65:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_mov_b32 s9, 0
-; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
-; GFX10-NEXT: s_lshr_b32 s0, s5, 31
-; GFX10-NEXT: s_lshr_b32 s8, s7, 31
-; GFX10-NEXT: s_mov_b32 s1, s9
+; GFX10-NEXT: s_lshr_b32 s2, s5, 31
+; GFX10-NEXT: s_mov_b32 s3, 0
; GFX10-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
+; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
+; GFX10-NEXT: s_lshr_b32 s2, s7, 31
+; GFX10-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
; GFX10-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65)
ret i128 %result
@@ -5778,9 +5774,9 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshl_b64 v[2:3], v[0:1], 1
; GFX6-NEXT: v_lshl_b64 v[0:1], v[6:7], 1
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5
+; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v7
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 31, v5
-; GFX6-NEXT: v_or_b32_e32 v0, v5, v0
; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -5789,9 +5785,9 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7]
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v7
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 31, v5
-; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -5800,9 +5796,9 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7]
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5
+; GFX9-NEXT: v_or_b32_e32 v0, v4, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v7
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 31, v5
-; GFX9-NEXT: v_or_b32_e32 v0, v5, v0
; GFX9-NEXT: v_or_b32_e32 v2, v2, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -5843,26 +5839,26 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX6-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1]
; GFX6-NEXT: s_cmp_lg_u32 s29, 0
; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], 1
-; GFX6-NEXT: s_lshl_b32 s11, s10, 31
-; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], 1
-; GFX6-NEXT: s_mov_b32 s10, s19
-; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX6-NEXT: s_lshr_b64 s[0:1], s[8:9], 1
+; GFX6-NEXT: s_lshl_b32 s9, s10, 31
+; GFX6-NEXT: s_mov_b32 s8, s19
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX6-NEXT: s_lshr_b64 s[8:9], s[10:11], 1
; GFX6-NEXT: s_sub_i32 s26, s16, 64
; GFX6-NEXT: s_sub_i32 s22, 64, s16
; GFX6-NEXT: s_cmp_lt_u32 s16, 64
; GFX6-NEXT: s_cselect_b32 s27, 1, 0
; GFX6-NEXT: s_cmp_eq_u32 s16, 0
; GFX6-NEXT: s_cselect_b32 s28, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[10:11], s[0:1], s16
-; GFX6-NEXT: s_lshl_b64 s[22:23], s[0:1], s22
-; GFX6-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
+; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s16
+; GFX6-NEXT: s_lshl_b64 s[22:23], s[8:9], s22
+; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s16
; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23]
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s26
+; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s26
; GFX6-NEXT: s_cmp_lg_u32 s27, 0
-; GFX6-NEXT: s_cselect_b64 s[0:1], s[16:17], s[0:1]
+; GFX6-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9]
; GFX6-NEXT: s_cmp_lg_u32 s28, 0
-; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
; GFX6-NEXT: s_cmp_lg_u32 s27, 0
; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], 0
; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
@@ -5881,30 +5877,30 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX6-NEXT: s_or_b64 s[8:9], s[20:21], s[8:9]
; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s11
; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], 0
; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], 0
; GFX6-NEXT: s_cmp_lg_u32 s22, 0
; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5]
-; GFX6-NEXT: s_lshr_b64 s[8:9], s[12:13], 1
-; GFX6-NEXT: s_lshl_b32 s13, s14, 31
-; GFX6-NEXT: s_mov_b32 s12, s19
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], 1
-; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[12:13], 1
+; GFX6-NEXT: s_lshl_b32 s9, s14, 31
+; GFX6-NEXT: s_mov_b32 s8, s19
+; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX6-NEXT: s_lshr_b64 s[8:9], s[14:15], 1
; GFX6-NEXT: s_sub_i32 s18, s10, 64
; GFX6-NEXT: s_sub_i32 s14, 64, s10
; GFX6-NEXT: s_cmp_lt_u32 s10, 64
; GFX6-NEXT: s_cselect_b32 s19, 1, 0
; GFX6-NEXT: s_cmp_eq_u32 s10, 0
; GFX6-NEXT: s_cselect_b32 s20, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[12:13], s[4:5], s10
-; GFX6-NEXT: s_lshl_b64 s[14:15], s[4:5], s14
-; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
+; GFX6-NEXT: s_lshr_b64 s[12:13], s[8:9], s10
+; GFX6-NEXT: s_lshl_b64 s[14:15], s[8:9], s14
+; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10
; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15]
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s18
+; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s18
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5]
+; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], 0
; GFX6-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5]
@@ -5932,26 +5928,26 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX8-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1]
; GFX8-NEXT: s_cmp_lg_u32 s29, 0
; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], 1
-; GFX8-NEXT: s_lshl_b32 s11, s10, 31
-; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], 1
-; GFX8-NEXT: s_mov_b32 s10, s19
-; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX8-NEXT: s_lshr_b64 s[0:1], s[8:9], 1
+; GFX8-NEXT: s_lshl_b32 s9, s10, 31
+; GFX8-NEXT: s_mov_b32 s8, s19
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX8-NEXT: s_lshr_b64 s[8:9], s[10:11], 1
; GFX8-NEXT: s_sub_i32 s26, s16, 64
; GFX8-NEXT: s_sub_i32 s22, 64, s16
; GFX8-NEXT: s_cmp_lt_u32 s16, 64
; GFX8-NEXT: s_cselect_b32 s27, 1, 0
; GFX8-NEXT: s_cmp_eq_u32 s16, 0
; GFX8-NEXT: s_cselect_b32 s28, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[10:11], s[0:1], s16
-; GFX8-NEXT: s_lshl_b64 s[22:23], s[0:1], s22
-; GFX8-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
+; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s16
+; GFX8-NEXT: s_lshl_b64 s[22:23], s[8:9], s22
+; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s16
; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23]
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s26
+; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s26
; GFX8-NEXT: s_cmp_lg_u32 s27, 0
-; GFX8-NEXT: s_cselect_b64 s[0:1], s[16:17], s[0:1]
+; GFX8-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9]
; GFX8-NEXT: s_cmp_lg_u32 s28, 0
-; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
; GFX8-NEXT: s_cmp_lg_u32 s27, 0
; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], 0
; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
@@ -5970,30 +5966,30 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX8-NEXT: s_or_b64 s[8:9], s[20:21], s[8:9]
; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s11
; GFX8-NEXT: s_cmp_lg_u32 s18, 0
-; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], 0
; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], 0
; GFX8-NEXT: s_cmp_lg_u32 s22, 0
; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5]
-; GFX8-NEXT: s_lshr_b64 s[8:9], s[12:13], 1
-; GFX8-NEXT: s_lshl_b32 s13, s14, 31
-; GFX8-NEXT: s_mov_b32 s12, s19
-; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], 1
-; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX8-NEXT: s_lshr_b64 s[4:5], s[12:13], 1
+; GFX8-NEXT: s_lshl_b32 s9, s14, 31
+; GFX8-NEXT: s_mov_b32 s8, s19
+; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX8-NEXT: s_lshr_b64 s[8:9], s[14:15], 1
; GFX8-NEXT: s_sub_i32 s18, s10, 64
; GFX8-NEXT: s_sub_i32 s14, 64, s10
; GFX8-NEXT: s_cmp_lt_u32 s10, 64
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: s_cmp_eq_u32 s10, 0
; GFX8-NEXT: s_cselect_b32 s20, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[12:13], s[4:5], s10
-; GFX8-NEXT: s_lshl_b64 s[14:15], s[4:5], s14
-; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
+; GFX8-NEXT: s_lshr_b64 s[12:13], s[8:9], s10
+; GFX8-NEXT: s_lshl_b64 s[14:15], s[8:9], s14
+; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10
; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15]
-; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s18
+; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s18
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
-; GFX8-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5]
+; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX8-NEXT: s_cmp_lg_u32 s20, 0
-; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
; GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], 0
; GFX8-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5]
@@ -6021,26 +6017,26 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX9-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1]
; GFX9-NEXT: s_cmp_lg_u32 s29, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], 1
-; GFX9-NEXT: s_lshl_b32 s11, s10, 31
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], 1
-; GFX9-NEXT: s_mov_b32 s10, s19
-; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], 1
+; GFX9-NEXT: s_lshl_b32 s9, s10, 31
+; GFX9-NEXT: s_mov_b32 s8, s19
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[10:11], 1
; GFX9-NEXT: s_sub_i32 s26, s16, 64
; GFX9-NEXT: s_sub_i32 s22, 64, s16
; GFX9-NEXT: s_cmp_lt_u32 s16, 64
; GFX9-NEXT: s_cselect_b32 s27, 1, 0
; GFX9-NEXT: s_cmp_eq_u32 s16, 0
; GFX9-NEXT: s_cselect_b32 s28, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[0:1], s16
-; GFX9-NEXT: s_lshl_b64 s[22:23], s[0:1], s22
-; GFX9-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s16
+; GFX9-NEXT: s_lshl_b64 s[22:23], s[8:9], s22
+; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s16
; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23]
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s26
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s26
; GFX9-NEXT: s_cmp_lg_u32 s27, 0
-; GFX9-NEXT: s_cselect_b64 s[0:1], s[16:17], s[0:1]
+; GFX9-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9]
; GFX9-NEXT: s_cmp_lg_u32 s28, 0
-; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
; GFX9-NEXT: s_cmp_lg_u32 s27, 0
; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], 0
; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
@@ -6059,30 +6055,30 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX9-NEXT: s_or_b64 s[8:9], s[20:21], s[8:9]
; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s11
; GFX9-NEXT: s_cmp_lg_u32 s18, 0
-; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], 0
; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], 0
; GFX9-NEXT: s_cmp_lg_u32 s22, 0
; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5]
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], 1
-; GFX9-NEXT: s_lshl_b32 s13, s14, 31
-; GFX9-NEXT: s_mov_b32 s12, s19
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], 1
-; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[12:13], 1
+; GFX9-NEXT: s_lshl_b32 s9, s14, 31
+; GFX9-NEXT: s_mov_b32 s8, s19
+; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[14:15], 1
; GFX9-NEXT: s_sub_i32 s18, s10, 64
; GFX9-NEXT: s_sub_i32 s14, 64, s10
; GFX9-NEXT: s_cmp_lt_u32 s10, 64
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
; GFX9-NEXT: s_cmp_eq_u32 s10, 0
; GFX9-NEXT: s_cselect_b32 s20, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[12:13], s[4:5], s10
-; GFX9-NEXT: s_lshl_b64 s[14:15], s[4:5], s14
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
+; GFX9-NEXT: s_lshr_b64 s[12:13], s[8:9], s10
+; GFX9-NEXT: s_lshl_b64 s[14:15], s[8:9], s14
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10
; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15]
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s18
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s18
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
-; GFX9-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5]
+; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX9-NEXT: s_cmp_lg_u32 s20, 0
-; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], 0
; GFX9-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5]
@@ -6113,21 +6109,21 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], 1
; GFX10-NEXT: s_lshl_b32 s9, s10, 31
; GFX10-NEXT: s_mov_b32 s8, s19
-; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], 1
-; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX10-NEXT: s_sub_i32 s26, s16, 64
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX10-NEXT: s_lshr_b64 s[8:9], s[10:11], 1
; GFX10-NEXT: s_sub_i32 s17, 64, s16
; GFX10-NEXT: s_cmp_lt_u32 s16, 64
; GFX10-NEXT: s_cselect_b32 s27, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s16, 0
; GFX10-NEXT: s_cselect_b32 s28, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], s16
-; GFX10-NEXT: s_lshl_b64 s[24:25], s[10:11], s17
-; GFX10-NEXT: s_lshr_b64 s[16:17], s[10:11], s16
-; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[24:25]
-; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s26
+; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s16
+; GFX10-NEXT: s_lshl_b64 s[24:25], s[8:9], s17
+; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
+; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25]
+; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s26
; GFX10-NEXT: s_cmp_lg_u32 s27, 0
-; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
+; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX10-NEXT: s_cmp_lg_u32 s28, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
; GFX10-NEXT: s_cmp_lg_u32 s27, 0
@@ -6155,21 +6151,21 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], 1
; GFX10-NEXT: s_lshl_b32 s13, s14, 31
; GFX10-NEXT: s_mov_b32 s12, s19
-; GFX10-NEXT: s_lshr_b64 s[14:15], s[14:15], 1
-; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13]
; GFX10-NEXT: s_sub_i32 s18, s10, 64
+; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13]
+; GFX10-NEXT: s_lshr_b64 s[12:13], s[14:15], 1
; GFX10-NEXT: s_sub_i32 s11, 64, s10
; GFX10-NEXT: s_cmp_lt_u32 s10, 64
; GFX10-NEXT: s_cselect_b32 s19, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s10, 0
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[12:13], s[4:5], s10
-; GFX10-NEXT: s_lshl_b64 s[16:17], s[14:15], s11
-; GFX10-NEXT: s_lshr_b64 s[10:11], s[14:15], s10
-; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[16:17]
-; GFX10-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
+; GFX10-NEXT: s_lshr_b64 s[14:15], s[4:5], s10
+; GFX10-NEXT: s_lshl_b64 s[16:17], s[12:13], s11
+; GFX10-NEXT: s_lshr_b64 s[10:11], s[12:13], s10
+; GFX10-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX10-NEXT: s_lshr_b64 s[12:13], s[12:13], s18
; GFX10-NEXT: s_cmp_lg_u32 s19, 0
-; GFX10-NEXT: s_cselect_b64 s[12:13], s[12:13], s[14:15]
+; GFX10-NEXT: s_cselect_b64 s[12:13], s[14:15], s[12:13]
; GFX10-NEXT: s_cmp_lg_u32 s20, 0
; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[12:13]
; GFX10-NEXT: s_cmp_lg_u32 s19, 0
@@ -6248,24 +6244,24 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; GFX6-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc
-; GFX6-NEXT: v_lshr_b64 v[6:7], v[12:13], 1
-; GFX6-NEXT: v_lshlrev_b32_e32 v8, 31, v14
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], 1
-; GFX6-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1
; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v17
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], v17
-; GFX6-NEXT: v_lshl_b64 v[10:11], v[4:5], v10
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v17
+; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10
; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, 64, v17
; GFX6-NEXT: v_or_b32_e32 v10, v8, v10
; GFX6-NEXT: v_or_b32_e32 v11, v9, v11
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v17
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v12
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], v17
+; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v12
; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17
-; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
; GFX6-NEXT: v_or_b32_e32 v4, v18, v4
@@ -6340,24 +6336,24 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; GFX8-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc
-; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[12:13]
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 31, v14
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[14:15]
-; GFX8-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13]
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15]
; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v17
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], v17, v[6:7]
-; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[4:5]
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], v17, v[4:5]
+; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7]
; GFX8-NEXT: v_subrev_u32_e32 v12, vcc, 64, v17
; GFX8-NEXT: v_or_b32_e32 v10, v8, v10
; GFX8-NEXT: v_or_b32_e32 v11, v9, v11
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], v17, v[4:5]
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v12, v[4:5]
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], v17, v[6:7]
+; GFX8-NEXT: v_lshrrev_b64 v[6:7], v12, v[6:7]
; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
; GFX8-NEXT: v_or_b32_e32 v4, v18, v4
@@ -6425,31 +6421,31 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX9-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v18, v[4:5]
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; GFX9-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[12:13]
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 31, v14
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[14:15]
-; GFX9-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13]
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 31, v14
+; GFX9-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15]
; GFX9-NEXT: v_sub_u32_e32 v10, 64, v17
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], v17, v[6:7]
-; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[4:5]
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], v17, v[4:5]
+; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7]
; GFX9-NEXT: v_subrev_u32_e32 v12, 64, v17
; GFX9-NEXT: v_or_b32_e32 v10, v8, v10
; GFX9-NEXT: v_or_b32_e32 v11, v9, v11
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], v17, v[4:5]
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], v12, v[4:5]
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], v17, v[6:7]
+; GFX9-NEXT: v_lshrrev_b64 v[6:7], v12, v[6:7]
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
; GFX9-NEXT: v_or_b32_e32 v4, v18, v4
@@ -6484,14 +6480,14 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v28
; GFX10-NEXT: v_or_b32_e32 v19, v17, v19
; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v28
-; GFX10-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v28
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v27
; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11]
; GFX10-NEXT: v_or_b32_e32 v23, v23, v25
; GFX10-NEXT: v_or_b32_e32 v24, v24, v26
; GFX10-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11]
-; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v28
; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v23, s4
; GFX10-NEXT: v_cndmask_b32_e64 v10, v17, v24, s4
; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v22, vcc_lo
@@ -6499,35 +6495,35 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, v16, v8, s5
; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v9, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v0, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s6
; GFX10-NEXT: v_and_b32_e32 v23, s7, v20
-; GFX10-NEXT: v_xor_b32_e32 v10, -1, v20
-; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v0, s4
; GFX10-NEXT: v_or_b32_e32 v0, v21, v3
+; GFX10-NEXT: v_xor_b32_e32 v3, -1, v20
+; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v1, s4
; GFX10-NEXT: v_or_b32_e32 v1, v11, v8
+; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v23
; GFX10-NEXT: v_or_b32_e32 v2, v2, v9
; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13]
-; GFX10-NEXT: v_sub_nc_u32_e32 v3, 64, v23
-; GFX10-NEXT: v_and_b32_e32 v25, s7, v10
+; GFX10-NEXT: v_and_b32_e32 v25, s7, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v16, 31, v14
+; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[12:13], v23, v[6:7]
; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15]
-; GFX10-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5]
; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v25
; GFX10-NEXT: v_or_b32_e32 v9, v9, v16
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v23
; GFX10-NEXT: v_lshlrev_b64 v[16:17], v23, v[4:5]
-; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23
; GFX10-NEXT: v_or_b32_e32 v12, v10, v12
; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v25
; GFX10-NEXT: v_lshrrev_b64 v[18:19], v25, v[8:9]
; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15]
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23
; GFX10-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5]
; GFX10-NEXT: v_or_b32_e32 v5, v11, v13
; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[14:15]
-; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo
; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v16, v18, v20
; GFX10-NEXT: v_or_b32_e32 v18, v19, v21
; GFX10-NEXT: v_cndmask_b32_e32 v12, v3, v12, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 65c0da80e20b1..5a378873ba8dd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -4762,10 +4762,10 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX6-NEXT: s_mov_b32 s11, 0
; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX6-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
-; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], 1
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s10, s1, 31
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 1
-; GFX6-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1]
+; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], 1
+; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[10:11]
; GFX6-NEXT: s_sub_i32 s13, s8, 64
; GFX6-NEXT: s_sub_i32 s9, 64, s8
; GFX6-NEXT: s_cmp_lt_u32 s8, 64
@@ -4809,10 +4809,10 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX8-NEXT: s_mov_b32 s11, 0
; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX8-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
-; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], 1
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s10, s1, 31
-; GFX8-NEXT: s_lshl_b64 s[0:1], s[2:3], 1
-; GFX8-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1]
+; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], 1
+; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[10:11]
; GFX8-NEXT: s_sub_i32 s13, s8, 64
; GFX8-NEXT: s_sub_i32 s9, 64, s8
; GFX8-NEXT: s_cmp_lt_u32 s8, 64
@@ -4856,10 +4856,10 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX9-NEXT: s_mov_b32 s11, 0
; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX9-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
-; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], 1
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s10, s1, 31
-; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 1
-; GFX9-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1]
+; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], 1
+; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[10:11]
; GFX9-NEXT: s_sub_i32 s13, s8, 64
; GFX9-NEXT: s_sub_i32 s9, 64, s8
; GFX9-NEXT: s_cmp_lt_u32 s8, 64
@@ -4906,7 +4906,7 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX10-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
; GFX10-NEXT: s_lshr_b32 s10, s1, 31
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[10:11], s[2:3]
+; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
; GFX10-NEXT: s_sub_i32 s13, s8, 64
; GFX10-NEXT: s_sub_i32 s9, 64, s8
; GFX10-NEXT: s_cmp_lt_u32 s8, 64
@@ -4958,7 +4958,7 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX6-NEXT: v_and_b32_e32 v15, s4, v8
; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1
-; GFX6-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v0
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v15
; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v0
; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v15
@@ -5007,7 +5007,7 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX8-NEXT: v_and_b32_e32 v15, s4, v8
; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v0
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v15
; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9]
; GFX8-NEXT: v_lshlrev_b64 v[10:11], v15, v[2:3]
@@ -5056,7 +5056,7 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX9-NEXT: v_and_b32_e32 v15, s4, v8
; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1
-; GFX9-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX9-NEXT: v_or_b32_e32 v2, v2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 64, v15
; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9]
; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[2:3]
@@ -5106,7 +5106,7 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX10-NEXT: v_and_b32_e32 v19, s4, v8
; GFX10-NEXT: v_and_b32_e32 v18, s4, v9
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT: v_or_b32_e32 v2, v10, v2
+; GFX10-NEXT: v_or_b32_e32 v2, v2, v10
; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19
; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18
; GFX10-NEXT: v_subrev_nc_u32_e32 v21, 64, v18
@@ -5155,12 +5155,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX6-NEXT: v_and_b32_e32 v6, s8, v0
; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX6-NEXT: v_and_b32_e32 v7, s8, v0
-; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
-; GFX6-NEXT: s_lshr_b32 s8, s1, 31
; GFX6-NEXT: s_mov_b32 s9, 0
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 1
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX6-NEXT: s_lshr_b32 s8, s1, 31
+; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v7
-; GFX6-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
+; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
; GFX6-NEXT: v_lshr_b64 v[0:1], s[10:11], v0
; GFX6-NEXT: v_lshl_b64 v[2:3], s[0:1], v7
; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v7
@@ -5208,12 +5208,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX8-NEXT: v_and_b32_e32 v6, s8, v0
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX8-NEXT: v_and_b32_e32 v7, s8, v0
-; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
-; GFX8-NEXT: s_lshr_b32 s8, s1, 31
; GFX8-NEXT: s_mov_b32 s9, 0
-; GFX8-NEXT: s_lshl_b64 s[0:1], s[2:3], 1
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX8-NEXT: s_lshr_b32 s8, s1, 31
+; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v7
-; GFX8-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
+; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[10:11]
; GFX8-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1]
; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7
@@ -5261,12 +5261,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX9-NEXT: v_and_b32_e32 v6, s8, v0
; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX9-NEXT: v_and_b32_e32 v7, s8, v0
-; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
-; GFX9-NEXT: s_lshr_b32 s8, s1, 31
; GFX9-NEXT: s_mov_b32 s9, 0
-; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 1
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX9-NEXT: s_lshr_b32 s8, s1, 31
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX9-NEXT: v_sub_u32_e32 v0, 64, v7
-; GFX9-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
+; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[10:11]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1]
; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7
@@ -5319,7 +5319,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13
-; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[2:3]
+; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12
; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9]
@@ -5368,10 +5368,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX6-NEXT: s_mov_b32 s7, 0
; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
-; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s6, s1, 31
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 1
-; GFX6-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1]
+; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
+; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[6:7]
; GFX6-NEXT: s_sub_i32 s9, s4, 64
; GFX6-NEXT: s_sub_i32 s5, 64, s4
; GFX6-NEXT: s_cmp_lt_u32 s4, 64
@@ -5424,10 +5424,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX8-NEXT: s_mov_b32 s7, 0
; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
-; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s6, s1, 31
-; GFX8-NEXT: s_lshl_b64 s[0:1], s[2:3], 1
-; GFX8-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1]
+; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
+; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[6:7]
; GFX8-NEXT: s_sub_i32 s9, s4, 64
; GFX8-NEXT: s_sub_i32 s5, 64, s4
; GFX8-NEXT: s_cmp_lt_u32 s4, 64
@@ -5480,10 +5480,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX9-NEXT: s_mov_b32 s7, 0
; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
-; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s6, s1, 31
-; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 1
-; GFX9-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
+; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[6:7]
; GFX9-NEXT: s_sub_i32 s9, s4, 64
; GFX9-NEXT: s_sub_i32 s5, 64, s4
; GFX9-NEXT: s_cmp_lt_u32 s4, 64
@@ -5539,7 +5539,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX10-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX10-NEXT: s_lshr_b32 s6, s1, 31
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3]
+; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
; GFX10-NEXT: s_sub_i32 s9, s4, 64
; GFX10-NEXT: s_sub_i32 s5, 64, s4
; GFX10-NEXT: s_cmp_lt_u32 s4, 64
@@ -5600,7 +5600,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX6-NEXT: v_lshl_b64 v[4:5], v[0:1], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX6-NEXT: s_cmp_lt_u32 s4, 64
-; GFX6-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v0
; GFX6-NEXT: s_cselect_b32 s7, 1, 0
; GFX6-NEXT: s_cmp_eq_u32 s4, 0
; GFX6-NEXT: s_cselect_b32 s9, 1, 0
@@ -5654,7 +5654,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX8-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX8-NEXT: s_cmp_lt_u32 s4, 64
-; GFX8-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v0
; GFX8-NEXT: s_cselect_b32 s7, 1, 0
; GFX8-NEXT: s_cmp_eq_u32 s4, 0
; GFX8-NEXT: s_cselect_b32 s9, 1, 0
@@ -5708,7 +5708,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX9-NEXT: s_cmp_lt_u32 s4, 64
-; GFX9-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX9-NEXT: v_or_b32_e32 v2, v2, v0
; GFX9-NEXT: s_cselect_b32 s7, 1, 0
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
; GFX9-NEXT: s_cselect_b32 s9, 1, 0
@@ -5759,7 +5759,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX10-NEXT: s_andn2_b64 s[8:9], s[6:7], s[4:5]
; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
-; GFX10-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX10-NEXT: v_or_b32_e32 v2, v2, v4
; GFX10-NEXT: s_sub_i32 s4, 64, s8
; GFX10-NEXT: s_sub_i32 s5, s8, 64
; GFX10-NEXT: s_cmp_lt_u32 s8, 64
@@ -5815,9 +5815,9 @@ define amdgpu_ps i128 @s_fshr_i128_65(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-NEXT: s_mov_b32 s4, 0
; GFX6-NEXT: s_lshl_b32 s5, s0, 31
; GFX6-NEXT: s_lshl_b32 s3, s2, 31
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
; GFX6-NEXT: s_mov_b32 s2, s4
-; GFX6-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3]
+; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
+; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1]
; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], 1
; GFX6-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
; GFX6-NEXT: ; return to shader part epilog
@@ -5827,9 +5827,9 @@ define amdgpu_ps i128 @s_fshr_i128_65(i128 inreg %lhs, i128 inreg %rhs) {
; GFX8-NEXT: s_mov_b32 s4, 0
; GFX8-NEXT: s_lshl_b32 s5, s0, 31
; GFX8-NEXT: s_lshl_b32 s3, s2, 31
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
; GFX8-NEXT: s_mov_b32 s2, s4
-; GFX8-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3]
+; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
+; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1]
; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], 1
; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
; GFX8-NEXT: ; return to shader part epilog
@@ -5839,9 +5839,9 @@ define amdgpu_ps i128 @s_fshr_i128_65(i128 inreg %lhs, i128 inreg %rhs) {
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: s_lshl_b32 s5, s0, 31
; GFX9-NEXT: s_lshl_b32 s3, s2, 31
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3]
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
+; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1]
; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], 1
; GFX9-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
; GFX9-NEXT: ; return to shader part epilog
@@ -5851,11 +5851,11 @@ define amdgpu_ps i128 @s_fshr_i128_65(i128 inreg %lhs, i128 inreg %rhs) {
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_lshl_b32 s3, s2, 31
; GFX10-NEXT: s_lshl_b32 s5, s0, 31
-; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], 1
-; GFX10-NEXT: s_lshr_b64 s[0:1], s[6:7], 1
; GFX10-NEXT: s_mov_b32 s2, s4
-; GFX10-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
-; GFX10-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
+; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], 1
+; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1
+; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX10-NEXT: s_or_b64 s[0:1], s[4:5], s[6:7]
; GFX10-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 65)
ret i128 %result
@@ -5865,37 +5865,34 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
; GFX6-LABEL: v_fshr_i128_65:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 31, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 31, v2
; GFX6-NEXT: v_lshr_b64 v[2:3], v[0:1], 1
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 31, v0
; GFX6-NEXT: v_lshr_b64 v[0:1], v[6:7], 1
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 31, v4
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX6-NEXT: v_or_b32_e32 v1, v5, v1
+; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v4, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshr_i128_65:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 31, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 31, v2
; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[0:1]
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 31, v0
; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[6:7]
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 31, v4
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX8-NEXT: v_or_b32_e32 v1, v5, v1
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_i128_65:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 31, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 31, v2
; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[0:1]
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 31, v0
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[6:7]
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 31, v4
-; GFX9-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX9-NEXT: v_or_b32_e32 v1, v5, v1
+; GFX9-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX9-NEXT: v_or_b32_e32 v1, v4, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fshr_i128_65:
@@ -5908,7 +5905,7 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 31, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 31, v8
; GFX10-NEXT: v_or_b32_e32 v1, v9, v5
-; GFX10-NEXT: v_or_b32_e32 v3, v3, v0
+; GFX10-NEXT: v_or_b32_e32 v3, v0, v3
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 65)
@@ -5924,9 +5921,9 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX6-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
; GFX6-NEXT: s_lshl_b64 s[24:25], s[0:1], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
-; GFX6-NEXT: s_mov_b32 s1, s19
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT: s_mov_b32 s1, s19
+; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX6-NEXT: s_sub_i32 s23, s16, 64
; GFX6-NEXT: s_sub_i32 s17, 64, s16
; GFX6-NEXT: s_cmp_lt_u32 s16, 64
@@ -5964,10 +5961,10 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX6-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11]
; GFX6-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21]
; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19]
-; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], 1
+; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
; GFX6-NEXT: s_lshr_b32 s18, s5, 31
-; GFX6-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
-; GFX6-NEXT: s_or_b64 s[4:5], s[18:19], s[4:5]
+; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], 1
+; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[18:19]
; GFX6-NEXT: s_sub_i32 s9, s10, 64
; GFX6-NEXT: s_sub_i32 s11, 64, s10
; GFX6-NEXT: s_cmp_lt_u32 s10, 64
@@ -6013,9 +6010,9 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX8-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
; GFX8-NEXT: s_lshl_b64 s[24:25], s[0:1], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
-; GFX8-NEXT: s_mov_b32 s1, s19
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: s_mov_b32 s1, s19
+; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX8-NEXT: s_sub_i32 s23, s16, 64
; GFX8-NEXT: s_sub_i32 s17, 64, s16
; GFX8-NEXT: s_cmp_lt_u32 s16, 64
@@ -6053,10 +6050,10 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX8-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11]
; GFX8-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21]
; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19]
-; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], 1
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
; GFX8-NEXT: s_lshr_b32 s18, s5, 31
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
-; GFX8-NEXT: s_or_b64 s[4:5], s[18:19], s[4:5]
+; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], 1
+; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[18:19]
; GFX8-NEXT: s_sub_i32 s9, s10, 64
; GFX8-NEXT: s_sub_i32 s11, 64, s10
; GFX8-NEXT: s_cmp_lt_u32 s10, 64
@@ -6102,9 +6099,9 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX9-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
; GFX9-NEXT: s_lshl_b64 s[24:25], s[0:1], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
-; GFX9-NEXT: s_mov_b32 s1, s19
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_mov_b32 s1, s19
+; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX9-NEXT: s_sub_i32 s23, s16, 64
; GFX9-NEXT: s_sub_i32 s17, 64, s16
; GFX9-NEXT: s_cmp_lt_u32 s16, 64
@@ -6142,10 +6139,10 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX9-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11]
; GFX9-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21]
; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19]
-; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], 1
+; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
; GFX9-NEXT: s_lshr_b32 s18, s5, 31
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
-; GFX9-NEXT: s_or_b64 s[4:5], s[18:19], s[4:5]
+; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], 1
+; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[18:19]
; GFX9-NEXT: s_sub_i32 s9, s10, 64
; GFX9-NEXT: s_sub_i32 s11, 64, s10
; GFX9-NEXT: s_cmp_lt_u32 s10, 64
@@ -6190,10 +6187,10 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX10-NEXT: s_lshr_b32 s24, s1, 31
; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
-; GFX10-NEXT: s_mov_b32 s25, s19
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX10-NEXT: s_mov_b32 s25, s19
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[24:25], s[2:3]
+; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[24:25]
; GFX10-NEXT: s_sub_i32 s23, s16, 64
; GFX10-NEXT: s_sub_i32 s17, 64, s16
; GFX10-NEXT: s_cmp_lt_u32 s16, 64
@@ -6234,7 +6231,7 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX10-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
; GFX10-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1]
; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
-; GFX10-NEXT: s_or_b64 s[6:7], s[18:19], s[6:7]
+; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19]
; GFX10-NEXT: s_sub_i32 s9, s10, 64
; GFX10-NEXT: s_sub_i32 s11, 64, s10
; GFX10-NEXT: s_cmp_lt_u32 s10, 64
@@ -6285,7 +6282,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX6-NEXT: v_and_b32_e32 v23, s6, v17
; GFX6-NEXT: v_lshrrev_b32_e32 v17, 31, v1
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; GFX6-NEXT: v_or_b32_e32 v2, v17, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v17
; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23
; GFX6-NEXT: v_and_b32_e32 v24, s6, v16
; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17
@@ -6328,7 +6325,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX6-NEXT: v_and_b32_e32 v17, s6, v8
; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX6-NEXT: v_or_b32_e32 v6, v4, v6
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v4
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v17
; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4
; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v17
@@ -6377,7 +6374,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX8-NEXT: v_and_b32_e32 v23, s6, v17
; GFX8-NEXT: v_lshrrev_b32_e32 v17, 31, v1
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX8-NEXT: v_or_b32_e32 v2, v17, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v17
; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23
; GFX8-NEXT: v_and_b32_e32 v24, s6, v16
; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1]
@@ -6420,7 +6417,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX8-NEXT: v_and_b32_e32 v17, s6, v8
; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX8-NEXT: v_or_b32_e32 v6, v4, v6
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v4
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v17
; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9]
; GFX8-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7]
@@ -6469,7 +6466,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX9-NEXT: v_and_b32_e32 v23, s6, v17
; GFX9-NEXT: v_lshrrev_b32_e32 v17, 31, v1
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX9-NEXT: v_or_b32_e32 v2, v17, v2
+; GFX9-NEXT: v_or_b32_e32 v2, v2, v17
; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23
; GFX9-NEXT: v_and_b32_e32 v24, s6, v16
; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1]
@@ -6512,7 +6509,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX9-NEXT: v_or_b32_e32 v3, v16, v9
; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5]
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX9-NEXT: v_or_b32_e32 v6, v4, v6
+; GFX9-NEXT: v_or_b32_e32 v6, v6, v4
; GFX9-NEXT: v_sub_u32_e32 v4, 64, v17
; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9]
; GFX9-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7]
@@ -6567,7 +6564,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX10-NEXT: v_subrev_nc_u32_e32 v27, 64, v26
; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v26
; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v25
-; GFX10-NEXT: v_or_b32_e32 v2, v17, v2
+; GFX10-NEXT: v_or_b32_e32 v2, v2, v17
; GFX10-NEXT: v_subrev_nc_u32_e32 v19, 64, v25
; GFX10-NEXT: v_lshlrev_b64 v[23:24], v25, v[0:1]
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25
@@ -6600,7 +6597,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX10-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5]
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v0, v23, v0
-; GFX10-NEXT: v_or_b32_e32 v6, v8, v6
+; GFX10-NEXT: v_or_b32_e32 v6, v6, v8
; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v25
; GFX10-NEXT: v_and_b32_e32 v23, s5, v20
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s4
diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp
index 96e7838faf03d..7b233b83c49be 100644
--- a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp
@@ -4054,4 +4054,47 @@ TEST_F(AArch64GISelMITest, moreElementsShuffle) {
EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF;
}
+// Test narrow scalar of G_SHL with constant shift amount
+TEST_F(AArch64GISelMITest, narrowScalarShiftByConstant) {
+ setUp();
+ if (!TM)
+ return;
+
+ DefineLegalizerInfo(A, {});
+
+ LLT S64{LLT::scalar(64)};
+ LLT S32{LLT::scalar(32)};
+
+ auto Constant = B.buildConstant(S64, 33);
+ auto Trunc = B.buildTrunc(S32, Constant);
+ auto Shift = B.buildShl(S64, Copies[0], Trunc);
+
+ AInfo Info(MF->getSubtarget());
+ DummyGISelObserver Observer;
+ LegalizerHelper Helper(*MF, Info, Observer, B);
+
+ // Perform Legalization
+ B.setInsertPt(*EntryMBB, Shift->getIterator());
+
+ // This should detect the G_CONSTANT feeding the G_SHL through a G_TRUNC
+ EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized,
+ Helper.narrowScalarShift(*Shift, 0, S32));
+
+ const auto *CheckStr = R"(
+ CHECK: [[COPY0:%[0-9]+]]:_(s64) = COPY
+ CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY
+ CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY
+ CHECK: [[THIRTY3:%[0-9]+]]:_(s64) = G_CONSTANT i64 33
+ CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %4:_(s64)
+ CHECK: [[UNMERGE:%[0-9]+]]:_(s32), [[UNMERGE2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY0]]
+ CHECK: [[ZERO:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ CHECK: [[ONE:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ CHECK: [[SHIFT:%[0-9]+]]:_(s32) = G_SHL [[UNMERGE]]:_, [[ONE]]:_(s32)
+ CHECK: [[MERGE:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[ZERO]]:_(s32), [[SHIFT]]:_(s32)
+ )";
+
+ // Check
+ EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF;
+}
+
} // namespace
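The unit test above asserts that a 64-bit G_SHL whose shift amount arrives as G_TRUNC(G_CONSTANT i64 33) is still narrowed directly: the 64-bit source is unmerged into 32-bit halves, the low half of the result becomes the constant 0, and the high half is the original low half shifted left by 33 - 32 = 1, then the halves are re-merged. As a minimal sketch of the arithmetic those CHECK lines encode (plain C++, not the LLVM API; the helper name is invented for illustration):

#include <cassert>
#include <cstdint>

// Mirrors the narrowed MIR: G_UNMERGE_VALUES, a single 32-bit G_SHL by 1,
// a constant-zero low half, and G_MERGE_VALUES back into 64 bits.
static uint64_t narrowShl64By33(uint64_t X) {
  uint32_t Lo = static_cast<uint32_t>(X); // low half from G_UNMERGE_VALUES
  // Shift amount 33 >= 32, so the low result half is 0 and the high result
  // half is the source's low half shifted by the remaining 33 - 32 = 1 bits.
  uint32_t ResLo = 0;                     // G_CONSTANT i32 0
  uint32_t ResHi = Lo << 1;               // G_SHL by G_CONSTANT i32 1
  return (static_cast<uint64_t>(ResHi) << 32) | ResLo; // G_MERGE_VALUES
}

int main() {
  // The narrowed form agrees with the plain 64-bit shift for any input.
  for (uint64_t X : {0x0123456789abcdefULL, 1ULL, ~0ULL})
    assert(narrowShl64By33(X) == X << 33);
  return 0;
}

This is the shape the generic shift-by-parts fallback would not produce; getting here requires recognizing the constant behind the G_TRUNC, which is what the new test exercises.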