[llvm] 3ad5216 - [AMDGPU] Better codegen for i64 bitreverse
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 26 07:51:48 PST 2021
Author: Jay Foad
Date: 2021-02-26T15:51:36Z
New Revision: 3ad5216ed88e303cb5d37864bb83b0eec81144af
URL: https://github.com/llvm/llvm-project/commit/3ad5216ed88e303cb5d37864bb83b0eec81144af
DIFF: https://github.com/llvm/llvm-project/commit/3ad5216ed88e303cb5d37864bb83b0eec81144af.diff
LOG: [AMDGPU] Better codegen for i64 bitreverse
Differential Revision: https://reviews.llvm.org/D97547
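
With this change a uniform i64 bitreverse is selected to a single s_brev_b64 on the SALU, and a divergent one is split into two v_bfrev_b32 with the 32-bit halves swapped, instead of the long shift-and-mask sequences removed from the tests below. A minimal IR reproducer that exercises the new path (an illustrative sketch mirroring the s_brev_i64 test, with a hypothetical function name, not part of the commit itself):

; Expected to compile to s_brev_b64 s[0:1], s[0:1] after this patch.
define amdgpu_kernel void @brev_i64_sketch(i64 addrspace(1)* %out, i64 %val) {
  %rev = call i64 @llvm.bitreverse.i64(i64 %val)
  store i64 %rev, i64 addrspace(1)* %out
  ret void
}

declare i64 @llvm.bitreverse.i64(i64)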
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/lib/Target/AMDGPU/SOPInstructions.td
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bitreverse.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitreverse.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bitreverse.mir
llvm/test/CodeGen/AMDGPU/bitreverse.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 931ce99088235..2925f70ece624 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -935,10 +935,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32);
+ // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
+ // RegBankSelect.
getActionDefinitionsBuilder(G_BITREVERSE)
- .legalFor({S32})
- .clampScalar(0, S32, S32)
- .scalarize(0);
+ .legalFor({S32, S64})
+ .clampScalar(0, S32, S64)
+ .scalarize(0)
+ .widenScalarToNextPow2(0);
if (ST.has16BitInsts()) {
getActionDefinitionsBuilder(G_BSWAP)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 59279b6819fa5..ae564d87964ee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2393,6 +2393,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case AMDGPU::G_CTPOP:
+ case AMDGPU::G_BITREVERSE:
case AMDGPU::G_CTLZ_ZERO_UNDEF:
case AMDGPU::G_CTTZ_ZERO_UNDEF: {
const RegisterBank *DstBank =
@@ -3607,10 +3608,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
break;
}
+ case AMDGPU::G_BITREVERSE:
case AMDGPU::G_BITCAST:
case AMDGPU::G_INTTOPTR:
case AMDGPU::G_PTRTOINT:
- case AMDGPU::G_BITREVERSE:
case AMDGPU::G_FABS:
case AMDGPU::G_FNEG: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6353659b0af8b..0eaec89b8b1f7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -388,6 +388,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
// FIXME: This should be narrowed to i32, but that only happens if i64 is
// illegal.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 46e213b39eb60..b8abd6d396e0e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5742,6 +5742,11 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
Inst.eraseFromParent();
continue;
+ case AMDGPU::S_BREV_B64:
+ splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
+ Inst.eraseFromParent();
+ continue;
+
case AMDGPU::S_NOT_B64:
splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
Inst.eraseFromParent();
@@ -6292,7 +6297,7 @@ void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
void SIInstrInfo::splitScalar64BitUnaryOp(
SetVectorType &Worklist, MachineInstr &Inst,
- unsigned Opcode) const {
+ unsigned Opcode, bool Swap) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -6325,6 +6330,9 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
+ if (Swap)
+ std::swap(DestSub0, DestSub1);
+
Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
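
The new Swap parameter is needed because reversing all 64 bits also exchanges the two 32-bit halves: the reversed low half becomes the new high half and vice versa. The split form produced by moveToVALU is equivalent to the following IR (an illustrative sketch, not taken from the commit):

define i64 @brev64_via_halves(i64 %x) {
  %lo = trunc i64 %x to i32
  %hi.shift = lshr i64 %x, 32
  %hi = trunc i64 %hi.shift to i32
  %rlo = call i32 @llvm.bitreverse.i32(i32 %lo)
  %rhi = call i32 @llvm.bitreverse.i32(i32 %hi)
  %rlo.ext = zext i32 %rlo to i64
  %rhi.ext = zext i32 %rhi to i64
  %hi.part = shl i64 %rlo.ext, 32   ; reversed low half becomes the high half
  %res = or i64 %hi.part, %rhi.ext  ; reversed high half becomes the low half
  ret i64 %res
}

declare i32 @llvm.bitreverse.i32(i32)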
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 49cfae590e4b5..b5a597c975c42 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -96,7 +96,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
unsigned Opcode) const;
void splitScalar64BitUnaryOp(SetVectorType &Worklist,
- MachineInstr &Inst, unsigned Opcode) const;
+ MachineInstr &Inst, unsigned Opcode,
+ bool Swap = false) const;
void splitScalar64BitAddSub(SetVectorType &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT = nullptr) const;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 7426af931a621..50725dea2e15e 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -195,7 +195,9 @@ def : GCNPat <
def S_BREV_B32 : SOP1_32 <"s_brev_b32",
[(set i32:$sdst, (bitreverse i32:$src0))]
>;
-def S_BREV_B64 : SOP1_64 <"s_brev_b64">;
+def S_BREV_B64 : SOP1_64 <"s_brev_b64",
+ [(set i64:$sdst, (bitreverse i64:$src0))]
+>;
let Defs = [SCC] in {
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bitreverse.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bitreverse.mir
index a99e602a1535f..eaa9a375cabe4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bitreverse.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bitreverse.mir
@@ -51,3 +51,68 @@ body: |
%1:vgpr(s32) = G_BITREVERSE %0
S_ENDPGM 0, implicit %1
...
+
+---
+name: bitreverse_i64_ss
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; CHECK-LABEL: name: bitreverse_i64_ss
+ ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; CHECK: [[S_BREV_B64_:%[0-9]+]]:sreg_64 = S_BREV_B64 [[COPY]]
+ ; CHECK: S_ENDPGM 0, implicit [[S_BREV_B64_]]
+ %0:sgpr(s64) = COPY $sgpr0_sgpr1
+ %1:sgpr(s64) = G_BITREVERSE %0
+ S_ENDPGM 0, implicit %1
+...
+
+---
+name: bitreverse_i64_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; CHECK-LABEL: name: bitreverse_i64_vv
+ ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+ ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+ ; CHECK: [[V_BFREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFREV_B32_e64 [[COPY2]], implicit $exec
+ ; CHECK: [[V_BFREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_BFREV_B32_e64 [[COPY1]], implicit $exec
+ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_BFREV_B32_e64_]], %subreg.sub0, [[V_BFREV_B32_e64_1]], %subreg.sub1
+ ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+ %0:vgpr(s64) = COPY $vgpr0_vgpr1
+ %2:vgpr(s32), %3:vgpr(s32) = G_UNMERGE_VALUES %0(s64)
+ %4:vgpr(s32) = G_BITREVERSE %3
+ %5:vgpr(s32) = G_BITREVERSE %2
+ %1:vgpr(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ S_ENDPGM 0, implicit %1
+...
+
+---
+name: bitreverse_i64_vs
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; CHECK-LABEL: name: bitreverse_i64_vs
+ ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+ ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; CHECK: [[V_BFREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFREV_B32_e64 [[COPY2]], implicit $exec
+ ; CHECK: [[V_BFREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_BFREV_B32_e64 [[COPY1]], implicit $exec
+ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_BFREV_B32_e64_]], %subreg.sub0, [[V_BFREV_B32_e64_1]], %subreg.sub1
+ ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+ %0:sgpr(s64) = COPY $sgpr0_sgpr1
+ %2:sgpr(s32), %3:sgpr(s32) = G_UNMERGE_VALUES %0(s64)
+ %4:vgpr(s32) = G_BITREVERSE %3
+ %5:vgpr(s32) = G_BITREVERSE %2
+ %1:vgpr(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ S_ENDPGM 0, implicit %1
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitreverse.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitreverse.mir
index 87b468ce6c6a9..c365bdcbfab75 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitreverse.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitreverse.mir
@@ -136,11 +136,8 @@ body: |
liveins: $vgpr0_vgpr1
; CHECK-LABEL: name: bitreverse_s64
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
- ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
- ; CHECK: [[BITREVERSE:%[0-9]+]]:_(s32) = G_BITREVERSE [[UV1]]
- ; CHECK: [[BITREVERSE1:%[0-9]+]]:_(s32) = G_BITREVERSE [[UV]]
- ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[BITREVERSE]](s32), [[BITREVERSE1]](s32)
- ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
+ ; CHECK: [[BITREVERSE:%[0-9]+]]:_(s64) = G_BITREVERSE [[COPY]]
+ ; CHECK: $vgpr0_vgpr1 = COPY [[BITREVERSE]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = G_BITREVERSE %0
$vgpr0_vgpr1 = COPY %1
@@ -155,15 +152,9 @@ body: |
; CHECK-LABEL: name: bitreverse_v2s64
; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
- ; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
- ; CHECK: [[BITREVERSE:%[0-9]+]]:_(s32) = G_BITREVERSE [[UV3]]
- ; CHECK: [[BITREVERSE1:%[0-9]+]]:_(s32) = G_BITREVERSE [[UV2]]
- ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[BITREVERSE]](s32), [[BITREVERSE1]](s32)
- ; CHECK: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
- ; CHECK: [[BITREVERSE2:%[0-9]+]]:_(s32) = G_BITREVERSE [[UV5]]
- ; CHECK: [[BITREVERSE3:%[0-9]+]]:_(s32) = G_BITREVERSE [[UV4]]
- ; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[BITREVERSE2]](s32), [[BITREVERSE3]](s32)
- ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
+ ; CHECK: [[BITREVERSE:%[0-9]+]]:_(s64) = G_BITREVERSE [[UV]]
+ ; CHECK: [[BITREVERSE1:%[0-9]+]]:_(s64) = G_BITREVERSE [[UV1]]
+ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[BITREVERSE]](s64), [[BITREVERSE1]](s64)
; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
%0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%1:_(<2 x s64>) = G_BITREVERSE %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bitreverse.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bitreverse.mir
index 15f471102c0d9..f5cb09eec3665 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bitreverse.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bitreverse.mir
@@ -29,3 +29,34 @@ body: |
%0:_(s32) = COPY $vgpr0
%1:_(s32) = G_BITREVERSE %0
...
+
+---
+name: bitreverse_i64_s
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0
+ ; CHECK-LABEL: name: bitreverse_i64_s
+ ; CHECK: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+ ; CHECK: [[BITREVERSE:%[0-9]+]]:sgpr(s64) = G_BITREVERSE [[COPY]]
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s64) = G_BITREVERSE %0
+...
+
+---
+name: bitreverse_i64_v
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; CHECK-LABEL: name: bitreverse_i64_v
+ ; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+ ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK: [[BITREVERSE:%[0-9]+]]:vgpr(s32) = G_BITREVERSE [[UV1]]
+ ; CHECK: [[BITREVERSE1:%[0-9]+]]:vgpr(s32) = G_BITREVERSE [[UV]]
+ ; CHECK: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[BITREVERSE]](s32), [[BITREVERSE1]](s32)
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = G_BITREVERSE %0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
index d6dc59a60dbf0..99ccc1d453193 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -2,6 +2,7 @@
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=GISEL
declare i32 @llvm.amdgcn.workitem.id.x() #1
@@ -41,6 +42,20 @@ define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val)
; FLAT-NEXT: v_mov_b32_e32 v0, s0
; FLAT-NEXT: buffer_store_short v0, off, s[4:7], 0
; FLAT-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_brev_i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dword s0, s[0:1], 0x2c
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
+; GISEL-NEXT: s_brev_b32 s0, s0
+; GISEL-NEXT: s_lshr_b32 s0, s0, 16
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-NEXT: flat_store_short v[0:1], v2
+; GISEL-NEXT: s_endpgm
%brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
store i16 %brev, i16 addrspace(1)* %out
ret void
@@ -78,6 +93,22 @@ define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrsp
; FLAT-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; FLAT-NEXT: buffer_store_short v0, off, s[4:7], 0
; FLAT-NEXT: s_endpgm
+;
+; GISEL-LABEL: v_brev_i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: flat_load_ushort v0, v[0:1]
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_bfrev_b32_e32 v0, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-NEXT: flat_store_short v[0:1], v2
+; GISEL-NEXT: s_endpgm
%val = load i16, i16 addrspace(1)* %valptr
%brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
store i16 %brev, i16 addrspace(1)* %out
@@ -108,6 +139,18 @@ define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val)
; FLAT-NEXT: v_mov_b32_e32 v0, s0
; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0
; FLAT-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_brev_i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dword s0, s[0:1], 0x2c
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: s_brev_b32 s0, s0
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-NEXT: flat_store_dword v[0:1], v2
+; GISEL-NEXT: s_endpgm
%brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
store i32 %brev, i32 addrspace(1)* %out
ret void
@@ -147,6 +190,24 @@ define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrsp
; FLAT-NEXT: v_bfrev_b32_e32 v0, v0
; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0
; FLAT-NEXT: s_endpgm
+;
+; GISEL-LABEL: v_brev_i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT: flat_load_dword v0, v[0:1]
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_bfrev_b32_e32 v2, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-NEXT: flat_store_dword v[0:1], v2
+; GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
%val = load i32, i32 addrspace(1)* %gep
@@ -183,6 +244,20 @@ define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; FLAT-NEXT: v_mov_b32_e32 v1, s1
; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; FLAT-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_brev_v2i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-NEXT: s_brev_b32 s0, s0
+; GISEL-NEXT: s_brev_b32 s1, s1
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GISEL-NEXT: s_endpgm
%brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
ret void
@@ -224,6 +299,25 @@ define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; FLAT-NEXT: v_bfrev_b32_e32 v0, v0
; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; FLAT-NEXT: s_endpgm
+;
+; GISEL-LABEL: v_brev_v2i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_bfrev_b32_e32 v0, v0
+; GISEL-NEXT: v_bfrev_b32_e32 v1, v1
+; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
%val = load <2 x i32>, <2 x i32> addrspace(1)* %gep
@@ -235,93 +329,42 @@ define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
; SI-LABEL: s_brev_i64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s4, 0xff00ff
-; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_alignbit_b32 v0, s2, s2, 8
-; SI-NEXT: v_alignbit_b32 v1, s2, s2, 24
-; SI-NEXT: v_alignbit_b32 v2, s3, s3, 8
-; SI-NEXT: v_alignbit_b32 v3, s3, s3, 24
-; SI-NEXT: v_bfi_b32 v4, s4, v1, v0
-; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f
-; SI-NEXT: v_bfi_b32 v2, s4, v3, v2
-; SI-NEXT: v_and_b32_e32 v1, s2, v4
-; SI-NEXT: v_and_b32_e32 v0, s2, v2
-; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0
-; SI-NEXT: v_and_b32_e32 v3, s2, v4
-; SI-NEXT: v_and_b32_e32 v2, s2, v2
-; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4
-; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4
-; SI-NEXT: s_mov_b32 s2, 0x33333333
-; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_or_b32_e32 v3, v3, v1
-; SI-NEXT: v_and_b32_e32 v1, s2, v3
-; SI-NEXT: v_and_b32_e32 v0, s2, v2
-; SI-NEXT: s_mov_b32 s2, 0xcccccccc
-; SI-NEXT: v_and_b32_e32 v3, s2, v3
-; SI-NEXT: v_and_b32_e32 v2, s2, v2
-; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
-; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2
-; SI-NEXT: s_mov_b32 s2, 0x55555555
-; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_or_b32_e32 v3, v3, v1
-; SI-NEXT: v_and_b32_e32 v1, s2, v3
-; SI-NEXT: v_and_b32_e32 v0, s2, v2
-; SI-NEXT: s_mov_b32 s2, 0xaaaaaaaa
-; SI-NEXT: v_and_b32_e32 v3, s2, v3
-; SI-NEXT: v_and_b32_e32 v2, s2, v2
-; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_or_b32_e32 v0, v2, v0
-; SI-NEXT: v_or_b32_e32 v1, v3, v1
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_brev_b64 s[0:1], s[0:1]
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; FLAT-LABEL: s_brev_i64:
; FLAT: ; %bb.0:
-; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; FLAT-NEXT: v_mov_b32_e32 v0, 0x10203
-; FLAT-NEXT: s_mov_b32 s4, 0xf0f0f0f
-; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; FLAT-NEXT: s_mov_b32 s7, 0xf000
+; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; FLAT-NEXT: v_perm_b32 v2, 0, s2, v0
-; FLAT-NEXT: v_perm_b32 v4, 0, s3, v0
-; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0
-; FLAT-NEXT: v_and_b32_e32 v1, s4, v2
-; FLAT-NEXT: v_and_b32_e32 v0, s4, v4
-; FLAT-NEXT: v_and_b32_e32 v3, s2, v2
-; FLAT-NEXT: v_and_b32_e32 v2, s2, v4
-; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1]
-; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3]
-; FLAT-NEXT: s_mov_b32 s2, 0x33333333
-; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT: v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT: v_and_b32_e32 v1, s2, v3
-; FLAT-NEXT: v_and_b32_e32 v0, s2, v2
-; FLAT-NEXT: s_mov_b32 s2, 0xcccccccc
-; FLAT-NEXT: v_and_b32_e32 v3, s2, v3
-; FLAT-NEXT: v_and_b32_e32 v2, s2, v2
-; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3]
-; FLAT-NEXT: s_mov_b32 s2, 0x55555555
-; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT: v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT: v_and_b32_e32 v1, s2, v3
-; FLAT-NEXT: v_and_b32_e32 v0, s2, v2
-; FLAT-NEXT: s_mov_b32 s2, 0xaaaaaaaa
-; FLAT-NEXT: v_and_b32_e32 v3, s2, v3
-; FLAT-NEXT: v_and_b32_e32 v2, s2, v2
-; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
-; FLAT-NEXT: s_mov_b32 s3, 0xf000
-; FLAT-NEXT: s_mov_b32 s2, -1
-; FLAT-NEXT: v_or_b32_e32 v0, v2, v0
-; FLAT-NEXT: v_or_b32_e32 v1, v3, v1
-; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; FLAT-NEXT: s_brev_b64 s[0:1], s[0:1]
+; FLAT-NEXT: v_mov_b32_e32 v0, s0
+; FLAT-NEXT: v_mov_b32_e32 v1, s1
+; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; FLAT-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_brev_i64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-NEXT: s_brev_b64 s[0:1], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GISEL-NEXT: s_endpgm
%brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
store i64 %brev, i64 addrspace(1)* %out
ret void
@@ -339,46 +382,11 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrsp
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
-; SI-NEXT: s_mov_b32 s0, 0xff00ff
-; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f
-; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0
-; SI-NEXT: s_mov_b32 s3, 0x33333333
-; SI-NEXT: s_mov_b32 s6, 0xcccccccc
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8
-; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
-; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8
-; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
-; SI-NEXT: v_bfi_b32 v2, s0, v0, v2
-; SI-NEXT: v_bfi_b32 v4, s0, v1, v3
-; SI-NEXT: v_and_b32_e32 v1, s1, v2
-; SI-NEXT: v_and_b32_e32 v0, s1, v4
-; SI-NEXT: v_and_b32_e32 v3, s2, v2
-; SI-NEXT: v_and_b32_e32 v2, s2, v4
-; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4
-; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4
-; SI-NEXT: s_mov_b32 s0, 0x55555555
-; SI-NEXT: v_or_b32_e32 v3, v3, v1
-; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_and_b32_e32 v1, s3, v3
-; SI-NEXT: v_and_b32_e32 v0, s3, v2
-; SI-NEXT: v_and_b32_e32 v3, s6, v3
-; SI-NEXT: v_and_b32_e32 v2, s6, v2
-; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
-; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2
-; SI-NEXT: s_mov_b32 s1, 0xaaaaaaaa
-; SI-NEXT: v_or_b32_e32 v3, v3, v1
-; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_and_b32_e32 v1, s0, v3
-; SI-NEXT: v_and_b32_e32 v0, s0, v2
-; SI-NEXT: v_and_b32_e32 v3, s1, v3
-; SI-NEXT: v_and_b32_e32 v2, s1, v2
-; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_or_b32_e32 v1, v3, v1
-; SI-NEXT: v_or_b32_e32 v0, v2, v0
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_bfrev_b32_e32 v2, v0
+; SI-NEXT: v_bfrev_b32_e32 v1, v1
+; SI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; FLAT-LABEL: v_brev_i64:
@@ -386,49 +394,37 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrsp
; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0
-; FLAT-NEXT: s_mov_b32 s3, 0x33333333
-; FLAT-NEXT: s_mov_b32 s6, 0xcccccccc
+; FLAT-NEXT: s_mov_b32 s7, 0xf000
+; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: v_mov_b32_e32 v1, s1
; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; FLAT-NEXT: s_mov_b32 s0, 0x10203
-; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f
-; FLAT-NEXT: s_mov_b32 s7, 0xf000
; FLAT-NEXT: s_waitcnt vmcnt(0)
-; FLAT-NEXT: v_perm_b32 v2, 0, v0, s0
-; FLAT-NEXT: v_perm_b32 v4, 0, v1, s0
-; FLAT-NEXT: v_and_b32_e32 v1, s1, v2
-; FLAT-NEXT: v_and_b32_e32 v0, s1, v4
-; FLAT-NEXT: v_and_b32_e32 v3, s2, v2
-; FLAT-NEXT: v_and_b32_e32 v2, s2, v4
-; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1]
-; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3]
-; FLAT-NEXT: s_mov_b32 s0, 0x55555555
-; FLAT-NEXT: v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT: v_and_b32_e32 v1, s3, v3
-; FLAT-NEXT: v_and_b32_e32 v0, s3, v2
-; FLAT-NEXT: v_and_b32_e32 v3, s6, v3
-; FLAT-NEXT: v_and_b32_e32 v2, s6, v2
-; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3]
-; FLAT-NEXT: s_mov_b32 s1, 0xaaaaaaaa
-; FLAT-NEXT: v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT: v_and_b32_e32 v1, s0, v3
-; FLAT-NEXT: v_and_b32_e32 v0, s0, v2
-; FLAT-NEXT: v_and_b32_e32 v3, s1, v3
-; FLAT-NEXT: v_and_b32_e32 v2, s1, v2
-; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
-; FLAT-NEXT: s_mov_b32 s6, -1
-; FLAT-NEXT: v_or_b32_e32 v1, v3, v1
-; FLAT-NEXT: v_or_b32_e32 v0, v2, v0
-; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; FLAT-NEXT: v_bfrev_b32_e32 v2, v0
+; FLAT-NEXT: v_bfrev_b32_e32 v1, v1
+; FLAT-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0
; FLAT-NEXT: s_endpgm
+;
+; GISEL-LABEL: v_brev_i64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v4, s3
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v3, s2
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_bfrev_b32_e32 v1, v1
+; GISEL-NEXT: v_bfrev_b32_e32 v2, v0
+; GISEL-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
+; GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid
%val = load i64, i64 addrspace(1)* %gep
@@ -442,76 +438,15 @@ define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
-; SI-NEXT: s_mov_b32 s8, 0xff00ff
-; SI-NEXT: s_mov_b32 s9, 0x33333333
-; SI-NEXT: s_mov_b32 s10, 0xcccccccc
-; SI-NEXT: s_mov_b32 s11, 0x55555555
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_alignbit_b32 v0, s2, s2, 8
-; SI-NEXT: v_alignbit_b32 v1, s2, s2, 24
-; SI-NEXT: v_bfi_b32 v3, s8, v1, v0
-; SI-NEXT: v_alignbit_b32 v2, s3, s3, 8
-; SI-NEXT: v_alignbit_b32 v0, s3, s3, 24
-; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f
-; SI-NEXT: v_bfi_b32 v2, s8, v0, v2
-; SI-NEXT: s_mov_b32 s3, 0xf0f0f0f0
-; SI-NEXT: v_and_b32_e32 v0, s2, v2
-; SI-NEXT: v_and_b32_e32 v1, s2, v3
-; SI-NEXT: v_and_b32_e32 v2, s3, v2
-; SI-NEXT: v_and_b32_e32 v3, s3, v3
-; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4
-; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4
-; SI-NEXT: v_alignbit_b32 v4, s0, s0, 8
-; SI-NEXT: v_alignbit_b32 v5, s0, s0, 24
-; SI-NEXT: v_bfi_b32 v7, s8, v5, v4
-; SI-NEXT: v_alignbit_b32 v4, s1, s1, 8
-; SI-NEXT: v_alignbit_b32 v5, s1, s1, 24
-; SI-NEXT: v_bfi_b32 v6, s8, v5, v4
-; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_or_b32_e32 v3, v3, v1
-; SI-NEXT: v_and_b32_e32 v0, s9, v2
-; SI-NEXT: v_and_b32_e32 v1, s9, v3
-; SI-NEXT: v_and_b32_e32 v4, s2, v6
-; SI-NEXT: v_and_b32_e32 v5, s2, v7
-; SI-NEXT: v_and_b32_e32 v2, s10, v2
-; SI-NEXT: v_and_b32_e32 v3, s10, v3
-; SI-NEXT: v_and_b32_e32 v6, s3, v6
-; SI-NEXT: v_and_b32_e32 v7, s3, v7
-; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
-; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2
-; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 4
-; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 4
-; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_or_b32_e32 v3, v3, v1
-; SI-NEXT: v_or_b32_e32 v6, v6, v4
-; SI-NEXT: v_or_b32_e32 v7, v7, v5
-; SI-NEXT: s_mov_b32 s12, 0xaaaaaaaa
-; SI-NEXT: v_and_b32_e32 v0, s11, v2
-; SI-NEXT: v_and_b32_e32 v1, s11, v3
-; SI-NEXT: v_and_b32_e32 v4, s9, v6
-; SI-NEXT: v_and_b32_e32 v5, s9, v7
-; SI-NEXT: v_and_b32_e32 v2, s12, v2
-; SI-NEXT: v_and_b32_e32 v3, s12, v3
-; SI-NEXT: v_and_b32_e32 v6, s10, v6
-; SI-NEXT: v_and_b32_e32 v7, s10, v7
-; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
-; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 2
-; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 2
-; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_or_b32_e32 v0, v6, v4
-; SI-NEXT: v_or_b32_e32 v7, v7, v5
-; SI-NEXT: v_and_b32_e32 v5, s11, v7
-; SI-NEXT: v_and_b32_e32 v4, s11, v0
-; SI-NEXT: v_and_b32_e32 v6, s12, v0
-; SI-NEXT: v_and_b32_e32 v7, s12, v7
-; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 1
-; SI-NEXT: v_or_b32_e32 v3, v3, v1
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_or_b32_e32 v0, v6, v4
-; SI-NEXT: v_or_b32_e32 v1, v7, v5
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_brev_b64 s[2:3], s[2:3]
+; SI-NEXT: s_brev_b64 s[0:1], s[0:1]
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -519,70 +454,33 @@ define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; FLAT: ; %bb.0:
; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
-; FLAT-NEXT: v_mov_b32_e32 v4, 0x10203
-; FLAT-NEXT: s_mov_b32 s8, 0xf0f0f0f
-; FLAT-NEXT: s_mov_b32 s9, 0xcccccccc
-; FLAT-NEXT: s_mov_b32 s10, 0x55555555
-; FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; FLAT-NEXT: v_perm_b32 v3, 0, s2, v4
-; FLAT-NEXT: v_perm_b32 v2, 0, s3, v4
-; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0
-; FLAT-NEXT: v_and_b32_e32 v0, s8, v2
-; FLAT-NEXT: v_and_b32_e32 v1, s8, v3
-; FLAT-NEXT: v_and_b32_e32 v2, s2, v2
-; FLAT-NEXT: v_and_b32_e32 v3, s2, v3
-; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1]
-; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3]
-; FLAT-NEXT: v_perm_b32 v7, 0, s0, v4
-; FLAT-NEXT: v_perm_b32 v6, 0, s1, v4
-; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT: s_mov_b32 s3, 0x33333333
-; FLAT-NEXT: v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT: v_and_b32_e32 v0, s3, v2
-; FLAT-NEXT: v_and_b32_e32 v1, s3, v3
-; FLAT-NEXT: v_and_b32_e32 v4, s8, v6
-; FLAT-NEXT: v_and_b32_e32 v5, s8, v7
-; FLAT-NEXT: v_and_b32_e32 v2, s9, v2
-; FLAT-NEXT: v_and_b32_e32 v3, s9, v3
-; FLAT-NEXT: v_and_b32_e32 v6, s2, v6
-; FLAT-NEXT: v_and_b32_e32 v7, s2, v7
-; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3]
-; FLAT-NEXT: v_lshlrev_b64 v[4:5], 4, v[4:5]
-; FLAT-NEXT: v_lshrrev_b64 v[6:7], 4, v[6:7]
-; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT: v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT: v_or_b32_e32 v6, v6, v4
-; FLAT-NEXT: v_or_b32_e32 v7, v7, v5
-; FLAT-NEXT: s_mov_b32 s11, 0xaaaaaaaa
-; FLAT-NEXT: v_and_b32_e32 v0, s10, v2
-; FLAT-NEXT: v_and_b32_e32 v1, s10, v3
-; FLAT-NEXT: v_and_b32_e32 v4, s3, v6
-; FLAT-NEXT: v_and_b32_e32 v5, s3, v7
-; FLAT-NEXT: v_and_b32_e32 v2, s11, v2
-; FLAT-NEXT: v_and_b32_e32 v3, s11, v3
-; FLAT-NEXT: v_and_b32_e32 v6, s9, v6
-; FLAT-NEXT: v_and_b32_e32 v7, s9, v7
-; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
-; FLAT-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
-; FLAT-NEXT: v_lshrrev_b64 v[6:7], 2, v[6:7]
-; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT: v_or_b32_e32 v0, v6, v4
-; FLAT-NEXT: v_or_b32_e32 v7, v7, v5
-; FLAT-NEXT: v_and_b32_e32 v5, s10, v7
-; FLAT-NEXT: v_and_b32_e32 v4, s10, v0
-; FLAT-NEXT: v_and_b32_e32 v6, s11, v0
-; FLAT-NEXT: v_and_b32_e32 v7, s11, v7
-; FLAT-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5]
-; FLAT-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
-; FLAT-NEXT: v_or_b32_e32 v3, v3, v1
; FLAT-NEXT: s_mov_b32 s7, 0xf000
; FLAT-NEXT: s_mov_b32 s6, -1
-; FLAT-NEXT: v_or_b32_e32 v0, v6, v4
-; FLAT-NEXT: v_or_b32_e32 v1, v7, v5
+; FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; FLAT-NEXT: s_brev_b64 s[2:3], s[2:3]
+; FLAT-NEXT: s_brev_b64 s[0:1], s[0:1]
+; FLAT-NEXT: v_mov_b32_e32 v0, s0
+; FLAT-NEXT: v_mov_b32_e32 v1, s1
+; FLAT-NEXT: v_mov_b32_e32 v2, s2
+; FLAT-NEXT: v_mov_b32_e32 v3, s3
; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; FLAT-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_brev_v2i64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v4, s4
+; GISEL-NEXT: s_brev_b64 s[0:1], s[0:1]
+; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GISEL-NEXT: v_mov_b32_e32 v5, s5
+; GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GISEL-NEXT: s_endpgm
%brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
ret void
@@ -600,76 +498,13 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
-; SI-NEXT: s_mov_b32 s0, 0xff00ff
-; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f
-; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0
-; SI-NEXT: s_mov_b32 s3, 0x33333333
-; SI-NEXT: s_mov_b32 s8, 0xcccccccc
-; SI-NEXT: s_mov_b32 s9, 0x55555555
-; SI-NEXT: s_mov_b32 s10, 0xaaaaaaaa
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v4, v2, v2, 8
-; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24
-; SI-NEXT: v_alignbit_b32 v5, v3, v3, 8
-; SI-NEXT: v_alignbit_b32 v6, v0, v0, 8
-; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
-; SI-NEXT: v_alignbit_b32 v7, v1, v1, 8
-; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
-; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24
-; SI-NEXT: v_bfi_b32 v2, s0, v2, v4
-; SI-NEXT: v_bfi_b32 v4, s0, v3, v5
-; SI-NEXT: v_bfi_b32 v6, s0, v0, v6
-; SI-NEXT: v_bfi_b32 v8, s0, v1, v7
-; SI-NEXT: v_and_b32_e32 v1, s1, v2
-; SI-NEXT: v_and_b32_e32 v0, s1, v4
-; SI-NEXT: v_and_b32_e32 v3, s2, v2
-; SI-NEXT: v_and_b32_e32 v2, s2, v4
-; SI-NEXT: v_and_b32_e32 v5, s1, v6
-; SI-NEXT: v_and_b32_e32 v4, s1, v8
-; SI-NEXT: v_and_b32_e32 v7, s2, v6
-; SI-NEXT: v_and_b32_e32 v6, s2, v8
-; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4
-; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4
-; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 4
-; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 4
-; SI-NEXT: v_or_b32_e32 v3, v3, v1
-; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_or_b32_e32 v7, v7, v5
-; SI-NEXT: v_or_b32_e32 v6, v6, v4
-; SI-NEXT: v_and_b32_e32 v1, s3, v3
-; SI-NEXT: v_and_b32_e32 v0, s3, v2
-; SI-NEXT: v_and_b32_e32 v5, s3, v7
-; SI-NEXT: v_and_b32_e32 v4, s3, v6
-; SI-NEXT: v_and_b32_e32 v3, s8, v3
-; SI-NEXT: v_and_b32_e32 v2, s8, v2
-; SI-NEXT: v_and_b32_e32 v7, s8, v7
-; SI-NEXT: v_and_b32_e32 v6, s8, v6
-; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
-; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2
-; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 2
-; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 2
-; SI-NEXT: v_or_b32_e32 v3, v3, v1
-; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_or_b32_e32 v7, v7, v5
-; SI-NEXT: v_or_b32_e32 v6, v6, v4
-; SI-NEXT: v_and_b32_e32 v1, s9, v3
-; SI-NEXT: v_and_b32_e32 v0, s9, v2
-; SI-NEXT: v_and_b32_e32 v5, s9, v7
-; SI-NEXT: v_and_b32_e32 v4, s9, v6
-; SI-NEXT: v_and_b32_e32 v3, s10, v3
-; SI-NEXT: v_and_b32_e32 v2, s10, v2
-; SI-NEXT: v_and_b32_e32 v7, s10, v7
-; SI-NEXT: v_and_b32_e32 v6, s10, v6
-; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
-; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 1
-; SI-NEXT: v_or_b32_e32 v3, v3, v1
-; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_or_b32_e32 v1, v7, v5
-; SI-NEXT: v_or_b32_e32 v0, v6, v4
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: v_bfrev_b32_e32 v4, v2
+; SI-NEXT: v_bfrev_b32_e32 v3, v3
+; SI-NEXT: v_bfrev_b32_e32 v2, v0
+; SI-NEXT: v_bfrev_b32_e32 v1, v1
+; SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; FLAT-LABEL: v_brev_v2i64:
@@ -677,75 +512,41 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0
-; FLAT-NEXT: s_mov_b32 s3, 0x33333333
-; FLAT-NEXT: s_mov_b32 s8, 0xcccccccc
+; FLAT-NEXT: s_mov_b32 s7, 0xf000
+; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: v_mov_b32_e32 v1, s1
; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; FLAT-NEXT: s_mov_b32 s0, 0x10203
-; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f
-; FLAT-NEXT: s_mov_b32 s9, 0x55555555
-; FLAT-NEXT: s_mov_b32 s10, 0xaaaaaaaa
-; FLAT-NEXT: s_mov_b32 s7, 0xf000
-; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: s_waitcnt vmcnt(0)
-; FLAT-NEXT: v_perm_b32 v6, 0, v0, s0
-; FLAT-NEXT: v_perm_b32 v4, 0, v3, s0
-; FLAT-NEXT: v_perm_b32 v2, 0, v2, s0
-; FLAT-NEXT: v_perm_b32 v8, 0, v1, s0
-; FLAT-NEXT: v_and_b32_e32 v1, s1, v2
-; FLAT-NEXT: v_and_b32_e32 v0, s1, v4
-; FLAT-NEXT: v_and_b32_e32 v3, s2, v2
-; FLAT-NEXT: v_and_b32_e32 v2, s2, v4
-; FLAT-NEXT: v_and_b32_e32 v5, s1, v6
-; FLAT-NEXT: v_and_b32_e32 v4, s1, v8
-; FLAT-NEXT: v_and_b32_e32 v7, s2, v6
-; FLAT-NEXT: v_and_b32_e32 v6, s2, v8
-; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1]
-; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3]
-; FLAT-NEXT: v_lshlrev_b64 v[4:5], 4, v[4:5]
-; FLAT-NEXT: v_lshrrev_b64 v[6:7], 4, v[6:7]
-; FLAT-NEXT: v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT: v_or_b32_e32 v7, v7, v5
-; FLAT-NEXT: v_or_b32_e32 v6, v6, v4
-; FLAT-NEXT: v_and_b32_e32 v1, s3, v3
-; FLAT-NEXT: v_and_b32_e32 v0, s3, v2
-; FLAT-NEXT: v_and_b32_e32 v5, s3, v7
-; FLAT-NEXT: v_and_b32_e32 v4, s3, v6
-; FLAT-NEXT: v_and_b32_e32 v3, s8, v3
-; FLAT-NEXT: v_and_b32_e32 v2, s8, v2
-; FLAT-NEXT: v_and_b32_e32 v7, s8, v7
-; FLAT-NEXT: v_and_b32_e32 v6, s8, v6
-; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3]
-; FLAT-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
-; FLAT-NEXT: v_lshrrev_b64 v[6:7], 2, v[6:7]
-; FLAT-NEXT: v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT: v_or_b32_e32 v7, v7, v5
-; FLAT-NEXT: v_or_b32_e32 v6, v6, v4
-; FLAT-NEXT: v_and_b32_e32 v1, s9, v3
-; FLAT-NEXT: v_and_b32_e32 v0, s9, v2
-; FLAT-NEXT: v_and_b32_e32 v5, s9, v7
-; FLAT-NEXT: v_and_b32_e32 v4, s9, v6
-; FLAT-NEXT: v_and_b32_e32 v3, s10, v3
-; FLAT-NEXT: v_and_b32_e32 v2, s10, v2
-; FLAT-NEXT: v_and_b32_e32 v7, s10, v7
-; FLAT-NEXT: v_and_b32_e32 v6, s10, v6
-; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
-; FLAT-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5]
-; FLAT-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
-; FLAT-NEXT: v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT: v_or_b32_e32 v1, v7, v5
-; FLAT-NEXT: v_or_b32_e32 v0, v6, v4
-; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; FLAT-NEXT: v_bfrev_b32_e32 v4, v2
+; FLAT-NEXT: v_bfrev_b32_e32 v3, v3
+; FLAT-NEXT: v_bfrev_b32_e32 v2, v0
+; FLAT-NEXT: v_bfrev_b32_e32 v1, v1
+; FLAT-NEXT: buffer_store_dwordx4 v[1:4], off, s[4:7], 0
; FLAT-NEXT: s_endpgm
+;
+; GISEL-LABEL: v_brev_v2i64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_bfrev_b32_e32 v4, v1
+; GISEL-NEXT: v_bfrev_b32_e32 v5, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: v_bfrev_b32_e32 v6, v3
+; GISEL-NEXT: v_bfrev_b32_e32 v7, v2
+; GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <2 x i64> , <2 x i64> addrspace(1)* %valptr, i32 %tid
%val = load <2 x i64>, <2 x i64> addrspace(1)* %gep
@@ -769,6 +570,13 @@ define float @missing_truncate_promote_bitreverse(i32 %arg) {
; FLAT-NEXT: v_bfrev_b32_e32 v0, v0
; FLAT-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; FLAT-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: missing_truncate_promote_bitreverse:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_bfrev_b32_e32 v0, v0
+; GISEL-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-NEXT: s_setpc_b64 s[30:31]
bb:
%tmp = trunc i32 %arg to i16
%tmp1 = call i16 @llvm.bitreverse.i16(i16 %tmp)