[llvm] [AMDGPU] Precommit test for D159533 (PR #66965)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 20 17:35:23 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
<details>
<summary>Changes</summary>
Precommit test ahead of https://reviews.llvm.org/D159533 for the ISD::FSHR / AMDGPUISD::PERM combine.
---
Full diff: https://github.com/llvm/llvm-project/pull/66965.diff
1 File Affected:
- (modified) llvm/test/CodeGen/AMDGPU/permute_i8.ll (+405)
``````````diff
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index c71f69edc76fa6e..1d139f1fb40c281 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -2883,6 +2883,411 @@ define hidden void @extract1347_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %i
ret void
}
+
+declare i16 @llvm.fshr.i16(i16, i16, i16)
+
+define hidden void @fshri16_8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshri16_8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: global_load_dword v7, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshri16_8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v6, v[0:1], off
+; GFX9-NEXT: global_load_dword v7, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x30407
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+ %v1e0 = extractelement <2 x i16> %vec1, i64 0
+ %v1e1 = extractelement <2 x i16> %vec1, i64 1
+ %v2e0 = extractelement <2 x i16> %vec2, i64 0
+ %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+ %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 8)
+ %byte01 = zext i16 %tmp01.0 to i32
+
+ %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 8)
+ %tmp23.1 = zext i16 %tmp23.0 to i32
+ %byte23 = shl i32 %tmp23.1, 16
+ %res = or i32 %byte01, %byte23
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
+define hidden void @fshri16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshri16_16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: global_load_dword v7, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3020706
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshri16_16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v6, v[0:1], off
+; GFX9-NEXT: global_load_dword v7, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+ %v1e0 = extractelement <2 x i16> %vec1, i64 0
+ %v1e1 = extractelement <2 x i16> %vec1, i64 1
+ %v2e0 = extractelement <2 x i16> %vec2, i64 0
+ %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+ %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 16)
+ %byte01 = zext i16 %tmp01.0 to i32
+
+ %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 16)
+ %tmp23.1 = zext i16 %tmp23.0 to i32
+ %byte23 = shl i32 %tmp23.1, 16
+ %res = or i32 %byte01, %byte23
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
+define hidden void @fshri16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshri16_24:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: global_load_dword v7, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshri16_24:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v6, v[0:1], off
+; GFX9-NEXT: global_load_dword v7, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x30407
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+ %v1e0 = extractelement <2 x i16> %vec1, i64 0
+ %v1e1 = extractelement <2 x i16> %vec1, i64 1
+ %v2e0 = extractelement <2 x i16> %vec2, i64 0
+ %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+ %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 24)
+ %byte01 = zext i16 %tmp01.0 to i32
+
+ %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 24)
+ %tmp23.1 = zext i16 %tmp23.0 to i32
+ %byte23 = shl i32 %tmp23.1, 16
+ %res = or i32 %byte01, %byte23
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
+define hidden void @fshri16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshri16_32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: global_load_dword v7, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3020706
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshri16_32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v6, v[0:1], off
+; GFX9-NEXT: global_load_dword v7, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+ %v1e0 = extractelement <2 x i16> %vec1, i64 0
+ %v1e1 = extractelement <2 x i16> %vec1, i64 1
+ %v2e0 = extractelement <2 x i16> %vec2, i64 0
+ %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+ %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 32)
+ %byte01 = zext i16 %tmp01.0 to i32
+
+ %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 32)
+ %tmp23.1 = zext i16 %tmp23.0 to i32
+ %byte23 = shl i32 %tmp23.1, 16
+ %res = or i32 %byte01, %byte23
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
+define hidden void @fshri16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshri16_88:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: global_load_dword v7, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshri16_88:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v6, v[0:1], off
+; GFX9-NEXT: global_load_dword v7, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x30407
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+ %v1e0 = extractelement <2 x i16> %vec1, i64 0
+ %v1e1 = extractelement <2 x i16> %vec1, i64 1
+ %v2e0 = extractelement <2 x i16> %vec2, i64 0
+ %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+ %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 88)
+ %byte01 = zext i16 %tmp01.0 to i32
+
+ %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 88)
+ %tmp23.1 = zext i16 %tmp23.0 to i32
+ %byte23 = shl i32 %tmp23.1, 16
+ %res = or i32 %byte01, %byte23
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
+declare i16 @llvm.fshl.i16(i16, i16, i16)
+
+define hidden void @fshli16_1347(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshli16_1347:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: global_load_dword v7, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshli16_1347:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v6, v[0:1], off
+; GFX9-NEXT: global_load_dword v7, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x30407
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+ %v1e0 = extractelement <2 x i16> %vec1, i64 0
+ %v1e1 = extractelement <2 x i16> %vec1, i64 1
+ %v2e0 = extractelement <2 x i16> %vec2, i64 0
+ %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+ %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 8)
+ %byte01 = zext i16 %tmp01.0 to i32
+
+ %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 8)
+ %tmp23.1 = zext i16 %tmp23.0 to i32
+ %byte23 = shl i32 %tmp23.1, 16
+ %res = or i32 %byte01, %byte23
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
+define hidden void @fshli16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshli16_16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: global_load_dword v7, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1000504
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshli16_16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v6, v[0:1], off
+; GFX9-NEXT: global_load_dword v7, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x1000504
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+ %v1e0 = extractelement <2 x i16> %vec1, i64 0
+ %v1e1 = extractelement <2 x i16> %vec1, i64 1
+ %v2e0 = extractelement <2 x i16> %vec2, i64 0
+ %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+ %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 16)
+ %byte01 = zext i16 %tmp01.0 to i32
+
+ %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 16)
+ %tmp23.1 = zext i16 %tmp23.0 to i32
+ %byte23 = shl i32 %tmp23.1, 16
+ %res = or i32 %byte01, %byte23
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
+define hidden void @fshli16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshli16_24:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: global_load_dword v7, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshli16_24:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v6, v[0:1], off
+; GFX9-NEXT: global_load_dword v7, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x30407
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+ %v1e0 = extractelement <2 x i16> %vec1, i64 0
+ %v1e1 = extractelement <2 x i16> %vec1, i64 1
+ %v2e0 = extractelement <2 x i16> %vec2, i64 0
+ %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+ %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 24)
+ %byte01 = zext i16 %tmp01.0 to i32
+
+ %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 24)
+ %tmp23.1 = zext i16 %tmp23.0 to i32
+ %byte23 = shl i32 %tmp23.1, 16
+ %res = or i32 %byte01, %byte23
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
+define hidden void @fshli16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshli16_32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: global_load_dword v7, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1000504
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshli16_32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v6, v[0:1], off
+; GFX9-NEXT: global_load_dword v7, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x1000504
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+ %v1e0 = extractelement <2 x i16> %vec1, i64 0
+ %v1e1 = extractelement <2 x i16> %vec1, i64 1
+ %v2e0 = extractelement <2 x i16> %vec2, i64 0
+ %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+ %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 32)
+ %byte01 = zext i16 %tmp01.0 to i32
+
+ %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 32)
+ %tmp23.1 = zext i16 %tmp23.0 to i32
+ %byte23 = shl i32 %tmp23.1, 16
+ %res = or i32 %byte01, %byte23
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
+define hidden void @fshli16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshli16_88:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: global_load_dword v7, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshli16_88:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v6, v[0:1], off
+; GFX9-NEXT: global_load_dword v7, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x30407
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+ %v1e0 = extractelement <2 x i16> %vec1, i64 0
+ %v1e1 = extractelement <2 x i16> %vec1, i64 1
+ %v2e0 = extractelement <2 x i16> %vec2, i64 0
+ %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+ %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 88)
+ %byte01 = zext i16 %tmp01.0 to i32
+
+ %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 88)
+ %tmp23.1 = zext i16 %tmp23.0 to i32
+ %byte23 = shl i32 %tmp23.1, 16
+ %res = or i32 %byte01, %byte23
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
define hidden void @shlbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i32 %base) {
; GFX10-LABEL: shlbase:
; GFX10: ; %bb.0:
``````````
</details>
https://github.com/llvm/llvm-project/pull/66965
More information about the llvm-commits mailing list