[llvm] 8991ce9 - [AMDGPU] Add basic clmul test coverage (#190205)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 2 09:41:40 PDT 2026
Author: Simon Pilgrim
Date: 2026-04-02T16:41:34Z
New Revision: 8991ce9cff7b4e1b72c19e202b7bfe3d36499aba
URL: https://github.com/llvm/llvm-project/commit/8991ce9cff7b4e1b72c19e202b7bfe3d36499aba
DIFF: https://github.com/llvm/llvm-project/commit/8991ce9cff7b4e1b72c19e202b7bfe3d36499aba.diff
LOG: [AMDGPU] Add basic clmul test coverage (#190205)
Added:
llvm/test/CodeGen/AMDGPU/clmul.ll
Modified:
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/clmul.ll b/llvm/test/CodeGen/AMDGPU/clmul.ll
new file mode 100644
index 0000000000000..03cf3da9bde19
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/clmul.ll
@@ -0,0 +1,4371 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1250 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX1250 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s
+
+define amdgpu_kernel void @test_clmul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; SI-LABEL: test_clmul_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s6
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_readfirstlane_b32 s5, v1
+; SI-NEXT: v_readfirstlane_b32 s4, v0
+; SI-NEXT: s_and_b32 s6, s5, 2
+; SI-NEXT: s_and_b32 s7, s5, 1
+; SI-NEXT: s_and_b32 s8, s5, 4
+; SI-NEXT: s_mul_i32 s6, s4, s6
+; SI-NEXT: s_mul_i32 s7, s4, s7
+; SI-NEXT: s_and_b32 s9, s5, 8
+; SI-NEXT: s_mul_i32 s8, s4, s8
+; SI-NEXT: s_xor_b32 s6, s7, s6
+; SI-NEXT: s_and_b32 s10, s5, 16
+; SI-NEXT: s_mul_i32 s9, s4, s9
+; SI-NEXT: s_xor_b32 s6, s6, s8
+; SI-NEXT: s_and_b32 s11, s5, 32
+; SI-NEXT: s_mul_i32 s10, s4, s10
+; SI-NEXT: s_xor_b32 s6, s6, s9
+; SI-NEXT: s_and_b32 s12, s5, 64
+; SI-NEXT: s_mul_i32 s11, s4, s11
+; SI-NEXT: s_xor_b32 s6, s6, s10
+; SI-NEXT: s_and_b32 s13, s5, 0x80
+; SI-NEXT: s_mul_i32 s12, s4, s12
+; SI-NEXT: s_xor_b32 s6, s6, s11
+; SI-NEXT: s_and_b32 s14, s5, 0x100
+; SI-NEXT: s_mul_i32 s13, s4, s13
+; SI-NEXT: s_xor_b32 s6, s6, s12
+; SI-NEXT: s_and_b32 s15, s5, 0x200
+; SI-NEXT: s_mul_i32 s14, s4, s14
+; SI-NEXT: s_xor_b32 s6, s6, s13
+; SI-NEXT: s_and_b32 s16, s5, 0x400
+; SI-NEXT: s_mul_i32 s15, s4, s15
+; SI-NEXT: s_xor_b32 s6, s6, s14
+; SI-NEXT: s_and_b32 s17, s5, 0x800
+; SI-NEXT: s_mul_i32 s16, s4, s16
+; SI-NEXT: s_xor_b32 s6, s6, s15
+; SI-NEXT: s_and_b32 s18, s5, 0x1000
+; SI-NEXT: s_mul_i32 s17, s4, s17
+; SI-NEXT: s_xor_b32 s6, s6, s16
+; SI-NEXT: s_and_b32 s19, s5, 0x2000
+; SI-NEXT: s_mul_i32 s18, s4, s18
+; SI-NEXT: s_xor_b32 s6, s6, s17
+; SI-NEXT: s_and_b32 s20, s5, 0x4000
+; SI-NEXT: s_mul_i32 s19, s4, s19
+; SI-NEXT: s_xor_b32 s6, s6, s18
+; SI-NEXT: s_and_b32 s21, s5, 0x8000
+; SI-NEXT: s_mul_i32 s20, s4, s20
+; SI-NEXT: s_xor_b32 s6, s6, s19
+; SI-NEXT: s_and_b32 s22, s5, 0x10000
+; SI-NEXT: s_mul_i32 s21, s4, s21
+; SI-NEXT: s_xor_b32 s6, s6, s20
+; SI-NEXT: s_and_b32 s23, s5, 0x20000
+; SI-NEXT: s_mul_i32 s22, s4, s22
+; SI-NEXT: s_xor_b32 s6, s6, s21
+; SI-NEXT: s_and_b32 s24, s5, 0x40000
+; SI-NEXT: s_mul_i32 s23, s4, s23
+; SI-NEXT: s_xor_b32 s6, s6, s22
+; SI-NEXT: s_and_b32 s25, s5, 0x80000
+; SI-NEXT: s_mul_i32 s24, s4, s24
+; SI-NEXT: s_xor_b32 s6, s6, s23
+; SI-NEXT: s_and_b32 s26, s5, 0x100000
+; SI-NEXT: s_mul_i32 s25, s4, s25
+; SI-NEXT: s_xor_b32 s6, s6, s24
+; SI-NEXT: s_and_b32 s27, s5, 0x200000
+; SI-NEXT: s_mul_i32 s26, s4, s26
+; SI-NEXT: s_xor_b32 s6, s6, s25
+; SI-NEXT: s_and_b32 s28, s5, 0x400000
+; SI-NEXT: s_mul_i32 s27, s4, s27
+; SI-NEXT: s_xor_b32 s6, s6, s26
+; SI-NEXT: s_and_b32 s29, s5, 0x800000
+; SI-NEXT: s_mul_i32 s28, s4, s28
+; SI-NEXT: s_xor_b32 s6, s6, s27
+; SI-NEXT: s_and_b32 s30, s5, 0x1000000
+; SI-NEXT: s_mul_i32 s29, s4, s29
+; SI-NEXT: s_xor_b32 s6, s6, s28
+; SI-NEXT: s_and_b32 s31, s5, 0x2000000
+; SI-NEXT: s_mul_i32 s30, s4, s30
+; SI-NEXT: s_xor_b32 s6, s6, s29
+; SI-NEXT: s_and_b32 s33, s5, 0x4000000
+; SI-NEXT: s_mul_i32 s31, s4, s31
+; SI-NEXT: s_xor_b32 s6, s6, s30
+; SI-NEXT: s_and_b32 s34, s5, 0x8000000
+; SI-NEXT: s_mul_i32 s33, s4, s33
+; SI-NEXT: s_xor_b32 s6, s6, s31
+; SI-NEXT: s_and_b32 s35, s5, 0x10000000
+; SI-NEXT: s_mul_i32 s34, s4, s34
+; SI-NEXT: s_xor_b32 s6, s6, s33
+; SI-NEXT: s_and_b32 s36, s5, 0x20000000
+; SI-NEXT: s_mul_i32 s35, s4, s35
+; SI-NEXT: s_xor_b32 s6, s6, s34
+; SI-NEXT: s_and_b32 s37, s5, 2.0
+; SI-NEXT: s_mul_i32 s36, s4, s36
+; SI-NEXT: s_xor_b32 s6, s6, s35
+; SI-NEXT: s_and_b32 s5, s5, 0x80000000
+; SI-NEXT: s_mul_i32 s37, s4, s37
+; SI-NEXT: s_xor_b32 s6, s6, s36
+; SI-NEXT: s_xor_b32 s6, s6, s37
+; SI-NEXT: s_mul_i32 s4, s4, s5
+; SI-NEXT: s_xor_b32 s4, s6, s4
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: test_clmul_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
+; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_readfirstlane_b32 s5, v1
+; VI-NEXT: v_readfirstlane_b32 s4, v0
+; VI-NEXT: s_and_b32 s6, s5, 2
+; VI-NEXT: s_and_b32 s7, s5, 1
+; VI-NEXT: s_and_b32 s8, s5, 4
+; VI-NEXT: s_mul_i32 s6, s4, s6
+; VI-NEXT: s_mul_i32 s7, s4, s7
+; VI-NEXT: s_and_b32 s9, s5, 8
+; VI-NEXT: s_mul_i32 s8, s4, s8
+; VI-NEXT: s_xor_b32 s6, s7, s6
+; VI-NEXT: s_and_b32 s10, s5, 16
+; VI-NEXT: s_mul_i32 s9, s4, s9
+; VI-NEXT: s_xor_b32 s6, s6, s8
+; VI-NEXT: s_and_b32 s11, s5, 32
+; VI-NEXT: s_mul_i32 s10, s4, s10
+; VI-NEXT: s_xor_b32 s6, s6, s9
+; VI-NEXT: s_and_b32 s12, s5, 64
+; VI-NEXT: s_mul_i32 s11, s4, s11
+; VI-NEXT: s_xor_b32 s6, s6, s10
+; VI-NEXT: s_and_b32 s13, s5, 0x80
+; VI-NEXT: s_mul_i32 s12, s4, s12
+; VI-NEXT: s_xor_b32 s6, s6, s11
+; VI-NEXT: s_and_b32 s14, s5, 0x100
+; VI-NEXT: s_mul_i32 s13, s4, s13
+; VI-NEXT: s_xor_b32 s6, s6, s12
+; VI-NEXT: s_and_b32 s15, s5, 0x200
+; VI-NEXT: s_mul_i32 s14, s4, s14
+; VI-NEXT: s_xor_b32 s6, s6, s13
+; VI-NEXT: s_and_b32 s16, s5, 0x400
+; VI-NEXT: s_mul_i32 s15, s4, s15
+; VI-NEXT: s_xor_b32 s6, s6, s14
+; VI-NEXT: s_and_b32 s17, s5, 0x800
+; VI-NEXT: s_mul_i32 s16, s4, s16
+; VI-NEXT: s_xor_b32 s6, s6, s15
+; VI-NEXT: s_and_b32 s18, s5, 0x1000
+; VI-NEXT: s_mul_i32 s17, s4, s17
+; VI-NEXT: s_xor_b32 s6, s6, s16
+; VI-NEXT: s_and_b32 s19, s5, 0x2000
+; VI-NEXT: s_mul_i32 s18, s4, s18
+; VI-NEXT: s_xor_b32 s6, s6, s17
+; VI-NEXT: s_and_b32 s20, s5, 0x4000
+; VI-NEXT: s_mul_i32 s19, s4, s19
+; VI-NEXT: s_xor_b32 s6, s6, s18
+; VI-NEXT: s_and_b32 s21, s5, 0x8000
+; VI-NEXT: s_mul_i32 s20, s4, s20
+; VI-NEXT: s_xor_b32 s6, s6, s19
+; VI-NEXT: s_and_b32 s22, s5, 0x10000
+; VI-NEXT: s_mul_i32 s21, s4, s21
+; VI-NEXT: s_xor_b32 s6, s6, s20
+; VI-NEXT: s_and_b32 s23, s5, 0x20000
+; VI-NEXT: s_mul_i32 s22, s4, s22
+; VI-NEXT: s_xor_b32 s6, s6, s21
+; VI-NEXT: s_and_b32 s24, s5, 0x40000
+; VI-NEXT: s_mul_i32 s23, s4, s23
+; VI-NEXT: s_xor_b32 s6, s6, s22
+; VI-NEXT: s_and_b32 s25, s5, 0x80000
+; VI-NEXT: s_mul_i32 s24, s4, s24
+; VI-NEXT: s_xor_b32 s6, s6, s23
+; VI-NEXT: s_and_b32 s26, s5, 0x100000
+; VI-NEXT: s_mul_i32 s25, s4, s25
+; VI-NEXT: s_xor_b32 s6, s6, s24
+; VI-NEXT: s_and_b32 s27, s5, 0x200000
+; VI-NEXT: s_mul_i32 s26, s4, s26
+; VI-NEXT: s_xor_b32 s6, s6, s25
+; VI-NEXT: s_and_b32 s28, s5, 0x400000
+; VI-NEXT: s_mul_i32 s27, s4, s27
+; VI-NEXT: s_xor_b32 s6, s6, s26
+; VI-NEXT: s_and_b32 s29, s5, 0x800000
+; VI-NEXT: s_mul_i32 s28, s4, s28
+; VI-NEXT: s_xor_b32 s6, s6, s27
+; VI-NEXT: s_and_b32 s30, s5, 0x1000000
+; VI-NEXT: s_mul_i32 s29, s4, s29
+; VI-NEXT: s_xor_b32 s6, s6, s28
+; VI-NEXT: s_and_b32 s31, s5, 0x2000000
+; VI-NEXT: s_mul_i32 s30, s4, s30
+; VI-NEXT: s_xor_b32 s6, s6, s29
+; VI-NEXT: s_and_b32 s33, s5, 0x4000000
+; VI-NEXT: s_mul_i32 s31, s4, s31
+; VI-NEXT: s_xor_b32 s6, s6, s30
+; VI-NEXT: s_and_b32 s34, s5, 0x8000000
+; VI-NEXT: s_mul_i32 s33, s4, s33
+; VI-NEXT: s_xor_b32 s6, s6, s31
+; VI-NEXT: s_and_b32 s35, s5, 0x10000000
+; VI-NEXT: s_mul_i32 s34, s4, s34
+; VI-NEXT: s_xor_b32 s6, s6, s33
+; VI-NEXT: s_and_b32 s36, s5, 0x20000000
+; VI-NEXT: s_mul_i32 s35, s4, s35
+; VI-NEXT: s_xor_b32 s6, s6, s34
+; VI-NEXT: s_and_b32 s37, s5, 2.0
+; VI-NEXT: s_mul_i32 s36, s4, s36
+; VI-NEXT: s_xor_b32 s6, s6, s35
+; VI-NEXT: s_and_b32 s5, s5, 0x80000000
+; VI-NEXT: s_mul_i32 s37, s4, s37
+; VI-NEXT: s_xor_b32 s6, s6, s36
+; VI-NEXT: s_xor_b32 s6, s6, s37
+; VI-NEXT: s_mul_i32 s4, s4, s5
+; VI-NEXT: s_xor_b32 s4, s6, s4
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_clmul_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s6, s2
+; GFX9-NEXT: s_mov_b32 s7, s3
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, s10
+; GFX9-NEXT: s_mov_b32 s5, s11
+; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: s_mov_b32 s0, s8
+; GFX9-NEXT: s_mov_b32 s1, s9
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_and_b32 s6, s5, 2
+; GFX9-NEXT: s_and_b32 s7, s5, 1
+; GFX9-NEXT: s_and_b32 s8, s5, 4
+; GFX9-NEXT: s_mul_i32 s6, s4, s6
+; GFX9-NEXT: s_mul_i32 s7, s4, s7
+; GFX9-NEXT: s_and_b32 s9, s5, 8
+; GFX9-NEXT: s_mul_i32 s8, s4, s8
+; GFX9-NEXT: s_xor_b32 s6, s7, s6
+; GFX9-NEXT: s_and_b32 s10, s5, 16
+; GFX9-NEXT: s_mul_i32 s9, s4, s9
+; GFX9-NEXT: s_xor_b32 s6, s6, s8
+; GFX9-NEXT: s_and_b32 s11, s5, 32
+; GFX9-NEXT: s_mul_i32 s10, s4, s10
+; GFX9-NEXT: s_xor_b32 s6, s6, s9
+; GFX9-NEXT: s_and_b32 s12, s5, 64
+; GFX9-NEXT: s_mul_i32 s11, s4, s11
+; GFX9-NEXT: s_xor_b32 s6, s6, s10
+; GFX9-NEXT: s_and_b32 s13, s5, 0x80
+; GFX9-NEXT: s_mul_i32 s12, s4, s12
+; GFX9-NEXT: s_xor_b32 s6, s6, s11
+; GFX9-NEXT: s_and_b32 s14, s5, 0x100
+; GFX9-NEXT: s_mul_i32 s13, s4, s13
+; GFX9-NEXT: s_xor_b32 s6, s6, s12
+; GFX9-NEXT: s_and_b32 s15, s5, 0x200
+; GFX9-NEXT: s_mul_i32 s14, s4, s14
+; GFX9-NEXT: s_xor_b32 s6, s6, s13
+; GFX9-NEXT: s_and_b32 s16, s5, 0x400
+; GFX9-NEXT: s_mul_i32 s15, s4, s15
+; GFX9-NEXT: s_xor_b32 s6, s6, s14
+; GFX9-NEXT: s_and_b32 s17, s5, 0x800
+; GFX9-NEXT: s_mul_i32 s16, s4, s16
+; GFX9-NEXT: s_xor_b32 s6, s6, s15
+; GFX9-NEXT: s_and_b32 s18, s5, 0x1000
+; GFX9-NEXT: s_mul_i32 s17, s4, s17
+; GFX9-NEXT: s_xor_b32 s6, s6, s16
+; GFX9-NEXT: s_and_b32 s19, s5, 0x2000
+; GFX9-NEXT: s_mul_i32 s18, s4, s18
+; GFX9-NEXT: s_xor_b32 s6, s6, s17
+; GFX9-NEXT: s_and_b32 s20, s5, 0x4000
+; GFX9-NEXT: s_mul_i32 s19, s4, s19
+; GFX9-NEXT: s_xor_b32 s6, s6, s18
+; GFX9-NEXT: s_and_b32 s21, s5, 0x8000
+; GFX9-NEXT: s_mul_i32 s20, s4, s20
+; GFX9-NEXT: s_xor_b32 s6, s6, s19
+; GFX9-NEXT: s_and_b32 s22, s5, 0x10000
+; GFX9-NEXT: s_mul_i32 s21, s4, s21
+; GFX9-NEXT: s_xor_b32 s6, s6, s20
+; GFX9-NEXT: s_and_b32 s23, s5, 0x20000
+; GFX9-NEXT: s_mul_i32 s22, s4, s22
+; GFX9-NEXT: s_xor_b32 s6, s6, s21
+; GFX9-NEXT: s_and_b32 s24, s5, 0x40000
+; GFX9-NEXT: s_mul_i32 s23, s4, s23
+; GFX9-NEXT: s_xor_b32 s6, s6, s22
+; GFX9-NEXT: s_and_b32 s25, s5, 0x80000
+; GFX9-NEXT: s_mul_i32 s24, s4, s24
+; GFX9-NEXT: s_xor_b32 s6, s6, s23
+; GFX9-NEXT: s_and_b32 s26, s5, 0x100000
+; GFX9-NEXT: s_mul_i32 s25, s4, s25
+; GFX9-NEXT: s_xor_b32 s6, s6, s24
+; GFX9-NEXT: s_and_b32 s27, s5, 0x200000
+; GFX9-NEXT: s_mul_i32 s26, s4, s26
+; GFX9-NEXT: s_xor_b32 s6, s6, s25
+; GFX9-NEXT: s_and_b32 s28, s5, 0x400000
+; GFX9-NEXT: s_mul_i32 s27, s4, s27
+; GFX9-NEXT: s_xor_b32 s6, s6, s26
+; GFX9-NEXT: s_and_b32 s29, s5, 0x800000
+; GFX9-NEXT: s_mul_i32 s28, s4, s28
+; GFX9-NEXT: s_xor_b32 s6, s6, s27
+; GFX9-NEXT: s_and_b32 s30, s5, 0x1000000
+; GFX9-NEXT: s_mul_i32 s29, s4, s29
+; GFX9-NEXT: s_xor_b32 s6, s6, s28
+; GFX9-NEXT: s_and_b32 s31, s5, 0x2000000
+; GFX9-NEXT: s_mul_i32 s30, s4, s30
+; GFX9-NEXT: s_xor_b32 s6, s6, s29
+; GFX9-NEXT: s_and_b32 s33, s5, 0x4000000
+; GFX9-NEXT: s_mul_i32 s31, s4, s31
+; GFX9-NEXT: s_xor_b32 s6, s6, s30
+; GFX9-NEXT: s_and_b32 s34, s5, 0x8000000
+; GFX9-NEXT: s_mul_i32 s33, s4, s33
+; GFX9-NEXT: s_xor_b32 s6, s6, s31
+; GFX9-NEXT: s_and_b32 s35, s5, 0x10000000
+; GFX9-NEXT: s_mul_i32 s34, s4, s34
+; GFX9-NEXT: s_xor_b32 s6, s6, s33
+; GFX9-NEXT: s_and_b32 s36, s5, 0x20000000
+; GFX9-NEXT: s_mul_i32 s35, s4, s35
+; GFX9-NEXT: s_xor_b32 s6, s6, s34
+; GFX9-NEXT: s_and_b32 s37, s5, 2.0
+; GFX9-NEXT: s_mul_i32 s36, s4, s36
+; GFX9-NEXT: s_xor_b32 s6, s6, s35
+; GFX9-NEXT: s_and_b32 s5, s5, 0x80000000
+; GFX9-NEXT: s_mul_i32 s37, s4, s37
+; GFX9-NEXT: s_xor_b32 s6, s6, s36
+; GFX9-NEXT: s_xor_b32 s6, s6, s37
+; GFX9-NEXT: s_mul_i32 s4, s4, s5
+; GFX9-NEXT: s_xor_b32 s4, s6, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: test_clmul_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_mov_b32 s6, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s6
+; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s8, s2
+; GFX10-NEXT: s_mov_b32 s9, s3
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10-NEXT: v_readfirstlane_b32 s3, v0
+; GFX10-NEXT: s_and_b32 s4, s2, 2
+; GFX10-NEXT: s_and_b32 s5, s2, 1
+; GFX10-NEXT: s_and_b32 s8, s2, 4
+; GFX10-NEXT: s_mul_i32 s4, s3, s4
+; GFX10-NEXT: s_mul_i32 s5, s3, s5
+; GFX10-NEXT: s_and_b32 s9, s2, 8
+; GFX10-NEXT: s_mul_i32 s8, s3, s8
+; GFX10-NEXT: s_xor_b32 s4, s5, s4
+; GFX10-NEXT: s_and_b32 s10, s2, 16
+; GFX10-NEXT: s_mul_i32 s5, s3, s9
+; GFX10-NEXT: s_xor_b32 s4, s4, s8
+; GFX10-NEXT: s_and_b32 s11, s2, 32
+; GFX10-NEXT: s_mul_i32 s8, s3, s10
+; GFX10-NEXT: s_xor_b32 s4, s4, s5
+; GFX10-NEXT: s_and_b32 s12, s2, 64
+; GFX10-NEXT: s_mul_i32 s5, s3, s11
+; GFX10-NEXT: s_xor_b32 s4, s4, s8
+; GFX10-NEXT: s_and_b32 s13, s2, 0x80
+; GFX10-NEXT: s_mul_i32 s8, s3, s12
+; GFX10-NEXT: s_xor_b32 s4, s4, s5
+; GFX10-NEXT: s_and_b32 s14, s2, 0x100
+; GFX10-NEXT: s_mul_i32 s5, s3, s13
+; GFX10-NEXT: s_xor_b32 s4, s4, s8
+; GFX10-NEXT: s_and_b32 s15, s2, 0x200
+; GFX10-NEXT: s_mul_i32 s8, s3, s14
+; GFX10-NEXT: s_xor_b32 s4, s4, s5
+; GFX10-NEXT: s_and_b32 s16, s2, 0x400
+; GFX10-NEXT: s_mul_i32 s5, s3, s15
+; GFX10-NEXT: s_xor_b32 s4, s4, s8
+; GFX10-NEXT: s_and_b32 s17, s2, 0x800
+; GFX10-NEXT: s_mul_i32 s8, s3, s16
+; GFX10-NEXT: s_xor_b32 s4, s4, s5
+; GFX10-NEXT: s_and_b32 s18, s2, 0x1000
+; GFX10-NEXT: s_mul_i32 s5, s3, s17
+; GFX10-NEXT: s_xor_b32 s4, s4, s8
+; GFX10-NEXT: s_and_b32 s19, s2, 0x2000
+; GFX10-NEXT: s_mul_i32 s8, s3, s18
+; GFX10-NEXT: s_xor_b32 s4, s4, s5
+; GFX10-NEXT: s_and_b32 s20, s2, 0x4000
+; GFX10-NEXT: s_mul_i32 s5, s3, s19
+; GFX10-NEXT: s_xor_b32 s4, s4, s8
+; GFX10-NEXT: s_and_b32 s21, s2, 0x8000
+; GFX10-NEXT: s_mul_i32 s8, s3, s20
+; GFX10-NEXT: s_xor_b32 s4, s4, s5
+; GFX10-NEXT: s_and_b32 s22, s2, 0x10000
+; GFX10-NEXT: s_mul_i32 s5, s3, s21
+; GFX10-NEXT: s_xor_b32 s4, s4, s8
+; GFX10-NEXT: s_and_b32 s23, s2, 0x20000
+; GFX10-NEXT: s_mul_i32 s8, s3, s22
+; GFX10-NEXT: s_xor_b32 s4, s4, s5
+; GFX10-NEXT: s_and_b32 s24, s2, 0x40000
+; GFX10-NEXT: s_mul_i32 s5, s3, s23
+; GFX10-NEXT: s_xor_b32 s4, s4, s8
+; GFX10-NEXT: s_and_b32 s25, s2, 0x80000
+; GFX10-NEXT: s_mul_i32 s8, s3, s24
+; GFX10-NEXT: s_xor_b32 s4, s4, s5
+; GFX10-NEXT: s_and_b32 s26, s2, 0x100000
+; GFX10-NEXT: s_mul_i32 s5, s3, s25
+; GFX10-NEXT: s_xor_b32 s4, s4, s8
+; GFX10-NEXT: s_and_b32 s27, s2, 0x200000
+; GFX10-NEXT: s_mul_i32 s8, s3, s26
+; GFX10-NEXT: s_xor_b32 s4, s4, s5
+; GFX10-NEXT: s_and_b32 s28, s2, 0x400000
+; GFX10-NEXT: s_mul_i32 s5, s3, s27
+; GFX10-NEXT: s_xor_b32 s4, s4, s8
+; GFX10-NEXT: s_and_b32 s29, s2, 0x800000
+; GFX10-NEXT: s_mul_i32 s8, s3, s28
+; GFX10-NEXT: s_xor_b32 s4, s4, s5
+; GFX10-NEXT: s_and_b32 s30, s2, 0x1000000
+; GFX10-NEXT: s_mul_i32 s5, s3, s29
+; GFX10-NEXT: s_xor_b32 s4, s4, s8
+; GFX10-NEXT: s_and_b32 s31, s2, 0x2000000
+; GFX10-NEXT: s_mul_i32 s8, s3, s30
+; GFX10-NEXT: s_xor_b32 s4, s4, s5
+; GFX10-NEXT: s_and_b32 s33, s2, 0x4000000
+; GFX10-NEXT: s_mul_i32 s5, s3, s31
+; GFX10-NEXT: s_xor_b32 s4, s4, s8
+; GFX10-NEXT: s_and_b32 s34, s2, 0x8000000
+; GFX10-NEXT: s_mul_i32 s8, s3, s33
+; GFX10-NEXT: s_xor_b32 s4, s4, s5
+; GFX10-NEXT: s_and_b32 s35, s2, 0x10000000
+; GFX10-NEXT: s_mul_i32 s5, s3, s34
+; GFX10-NEXT: s_xor_b32 s4, s4, s8
+; GFX10-NEXT: s_and_b32 s36, s2, 0x20000000
+; GFX10-NEXT: s_mul_i32 s8, s3, s35
+; GFX10-NEXT: s_xor_b32 s4, s4, s5
+; GFX10-NEXT: s_and_b32 s37, s2, 2.0
+; GFX10-NEXT: s_mul_i32 s5, s3, s36
+; GFX10-NEXT: s_xor_b32 s4, s4, s8
+; GFX10-NEXT: s_mul_i32 s8, s3, s37
+; GFX10-NEXT: s_xor_b32 s4, s4, s5
+; GFX10-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX10-NEXT: s_xor_b32 s4, s4, s8
+; GFX10-NEXT: s_mul_i32 s3, s3, s2
+; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_xor_b32 s2, s4, s3
+; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_clmul_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s6
+; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s2
+; GFX11-NEXT: s_mov_b32 s9, s3
+; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-NEXT: v_readfirstlane_b32 s3, v0
+; GFX11-NEXT: s_and_b32 s4, s2, 2
+; GFX11-NEXT: s_and_b32 s5, s2, 1
+; GFX11-NEXT: s_and_b32 s8, s2, 4
+; GFX11-NEXT: s_mul_i32 s4, s3, s4
+; GFX11-NEXT: s_mul_i32 s5, s3, s5
+; GFX11-NEXT: s_and_b32 s9, s2, 8
+; GFX11-NEXT: s_mul_i32 s8, s3, s8
+; GFX11-NEXT: s_xor_b32 s4, s5, s4
+; GFX11-NEXT: s_and_b32 s10, s2, 16
+; GFX11-NEXT: s_mul_i32 s5, s3, s9
+; GFX11-NEXT: s_xor_b32 s4, s4, s8
+; GFX11-NEXT: s_and_b32 s11, s2, 32
+; GFX11-NEXT: s_mul_i32 s8, s3, s10
+; GFX11-NEXT: s_xor_b32 s4, s4, s5
+; GFX11-NEXT: s_and_b32 s12, s2, 64
+; GFX11-NEXT: s_mul_i32 s5, s3, s11
+; GFX11-NEXT: s_xor_b32 s4, s4, s8
+; GFX11-NEXT: s_and_b32 s13, s2, 0x80
+; GFX11-NEXT: s_mul_i32 s8, s3, s12
+; GFX11-NEXT: s_xor_b32 s4, s4, s5
+; GFX11-NEXT: s_and_b32 s14, s2, 0x100
+; GFX11-NEXT: s_mul_i32 s5, s3, s13
+; GFX11-NEXT: s_xor_b32 s4, s4, s8
+; GFX11-NEXT: s_and_b32 s15, s2, 0x200
+; GFX11-NEXT: s_mul_i32 s8, s3, s14
+; GFX11-NEXT: s_xor_b32 s4, s4, s5
+; GFX11-NEXT: s_and_b32 s16, s2, 0x400
+; GFX11-NEXT: s_mul_i32 s5, s3, s15
+; GFX11-NEXT: s_xor_b32 s4, s4, s8
+; GFX11-NEXT: s_and_b32 s17, s2, 0x800
+; GFX11-NEXT: s_mul_i32 s8, s3, s16
+; GFX11-NEXT: s_xor_b32 s4, s4, s5
+; GFX11-NEXT: s_and_b32 s18, s2, 0x1000
+; GFX11-NEXT: s_mul_i32 s5, s3, s17
+; GFX11-NEXT: s_xor_b32 s4, s4, s8
+; GFX11-NEXT: s_and_b32 s19, s2, 0x2000
+; GFX11-NEXT: s_mul_i32 s8, s3, s18
+; GFX11-NEXT: s_xor_b32 s4, s4, s5
+; GFX11-NEXT: s_and_b32 s20, s2, 0x4000
+; GFX11-NEXT: s_mul_i32 s5, s3, s19
+; GFX11-NEXT: s_xor_b32 s4, s4, s8
+; GFX11-NEXT: s_and_b32 s21, s2, 0x8000
+; GFX11-NEXT: s_mul_i32 s8, s3, s20
+; GFX11-NEXT: s_xor_b32 s4, s4, s5
+; GFX11-NEXT: s_and_b32 s22, s2, 0x10000
+; GFX11-NEXT: s_mul_i32 s5, s3, s21
+; GFX11-NEXT: s_xor_b32 s4, s4, s8
+; GFX11-NEXT: s_and_b32 s23, s2, 0x20000
+; GFX11-NEXT: s_mul_i32 s8, s3, s22
+; GFX11-NEXT: s_xor_b32 s4, s4, s5
+; GFX11-NEXT: s_and_b32 s24, s2, 0x40000
+; GFX11-NEXT: s_mul_i32 s5, s3, s23
+; GFX11-NEXT: s_xor_b32 s4, s4, s8
+; GFX11-NEXT: s_and_b32 s25, s2, 0x80000
+; GFX11-NEXT: s_mul_i32 s8, s3, s24
+; GFX11-NEXT: s_xor_b32 s4, s4, s5
+; GFX11-NEXT: s_and_b32 s26, s2, 0x100000
+; GFX11-NEXT: s_mul_i32 s5, s3, s25
+; GFX11-NEXT: s_xor_b32 s4, s4, s8
+; GFX11-NEXT: s_and_b32 s27, s2, 0x200000
+; GFX11-NEXT: s_mul_i32 s8, s3, s26
+; GFX11-NEXT: s_xor_b32 s4, s4, s5
+; GFX11-NEXT: s_and_b32 s28, s2, 0x400000
+; GFX11-NEXT: s_mul_i32 s5, s3, s27
+; GFX11-NEXT: s_xor_b32 s4, s4, s8
+; GFX11-NEXT: s_and_b32 s29, s2, 0x800000
+; GFX11-NEXT: s_mul_i32 s8, s3, s28
+; GFX11-NEXT: s_xor_b32 s4, s4, s5
+; GFX11-NEXT: s_and_b32 s30, s2, 0x1000000
+; GFX11-NEXT: s_mul_i32 s5, s3, s29
+; GFX11-NEXT: s_xor_b32 s4, s4, s8
+; GFX11-NEXT: s_and_b32 s31, s2, 0x2000000
+; GFX11-NEXT: s_mul_i32 s8, s3, s30
+; GFX11-NEXT: s_xor_b32 s4, s4, s5
+; GFX11-NEXT: s_and_b32 s33, s2, 0x4000000
+; GFX11-NEXT: s_mul_i32 s5, s3, s31
+; GFX11-NEXT: s_xor_b32 s4, s4, s8
+; GFX11-NEXT: s_and_b32 s34, s2, 0x8000000
+; GFX11-NEXT: s_mul_i32 s8, s3, s33
+; GFX11-NEXT: s_xor_b32 s4, s4, s5
+; GFX11-NEXT: s_and_b32 s35, s2, 0x10000000
+; GFX11-NEXT: s_mul_i32 s5, s3, s34
+; GFX11-NEXT: s_xor_b32 s4, s4, s8
+; GFX11-NEXT: s_and_b32 s36, s2, 0x20000000
+; GFX11-NEXT: s_mul_i32 s8, s3, s35
+; GFX11-NEXT: s_xor_b32 s4, s4, s5
+; GFX11-NEXT: s_and_b32 s37, s2, 2.0
+; GFX11-NEXT: s_mul_i32 s5, s3, s36
+; GFX11-NEXT: s_xor_b32 s4, s4, s8
+; GFX11-NEXT: s_mul_i32 s8, s3, s37
+; GFX11-NEXT: s_xor_b32 s4, s4, s5
+; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_xor_b32 s4, s4, s8
+; GFX11-NEXT: s_mul_i32 s3, s3, s2
+; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_xor_b32 s2, s4, s3
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_clmul_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s2
+; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12-NEXT: v_readfirstlane_b32 s3, v0
+; GFX12-NEXT: s_and_b32 s4, s2, 2
+; GFX12-NEXT: s_and_b32 s5, s2, 1
+; GFX12-NEXT: s_and_b32 s8, s2, 4
+; GFX12-NEXT: s_mul_i32 s4, s3, s4
+; GFX12-NEXT: s_mul_i32 s5, s3, s5
+; GFX12-NEXT: s_and_b32 s9, s2, 8
+; GFX12-NEXT: s_mul_i32 s8, s3, s8
+; GFX12-NEXT: s_xor_b32 s4, s5, s4
+; GFX12-NEXT: s_and_b32 s10, s2, 16
+; GFX12-NEXT: s_mul_i32 s5, s3, s9
+; GFX12-NEXT: s_xor_b32 s4, s4, s8
+; GFX12-NEXT: s_and_b32 s11, s2, 32
+; GFX12-NEXT: s_mul_i32 s8, s3, s10
+; GFX12-NEXT: s_xor_b32 s4, s4, s5
+; GFX12-NEXT: s_and_b32 s12, s2, 64
+; GFX12-NEXT: s_mul_i32 s5, s3, s11
+; GFX12-NEXT: s_xor_b32 s4, s4, s8
+; GFX12-NEXT: s_and_b32 s13, s2, 0x80
+; GFX12-NEXT: s_mul_i32 s8, s3, s12
+; GFX12-NEXT: s_xor_b32 s4, s4, s5
+; GFX12-NEXT: s_and_b32 s14, s2, 0x100
+; GFX12-NEXT: s_mul_i32 s5, s3, s13
+; GFX12-NEXT: s_xor_b32 s4, s4, s8
+; GFX12-NEXT: s_and_b32 s15, s2, 0x200
+; GFX12-NEXT: s_mul_i32 s8, s3, s14
+; GFX12-NEXT: s_xor_b32 s4, s4, s5
+; GFX12-NEXT: s_and_b32 s16, s2, 0x400
+; GFX12-NEXT: s_mul_i32 s5, s3, s15
+; GFX12-NEXT: s_xor_b32 s4, s4, s8
+; GFX12-NEXT: s_and_b32 s17, s2, 0x800
+; GFX12-NEXT: s_mul_i32 s8, s3, s16
+; GFX12-NEXT: s_xor_b32 s4, s4, s5
+; GFX12-NEXT: s_and_b32 s18, s2, 0x1000
+; GFX12-NEXT: s_mul_i32 s5, s3, s17
+; GFX12-NEXT: s_xor_b32 s4, s4, s8
+; GFX12-NEXT: s_and_b32 s19, s2, 0x2000
+; GFX12-NEXT: s_mul_i32 s8, s3, s18
+; GFX12-NEXT: s_xor_b32 s4, s4, s5
+; GFX12-NEXT: s_and_b32 s20, s2, 0x4000
+; GFX12-NEXT: s_mul_i32 s5, s3, s19
+; GFX12-NEXT: s_xor_b32 s4, s4, s8
+; GFX12-NEXT: s_and_b32 s21, s2, 0x8000
+; GFX12-NEXT: s_mul_i32 s8, s3, s20
+; GFX12-NEXT: s_xor_b32 s4, s4, s5
+; GFX12-NEXT: s_and_b32 s22, s2, 0x10000
+; GFX12-NEXT: s_mul_i32 s5, s3, s21
+; GFX12-NEXT: s_xor_b32 s4, s4, s8
+; GFX12-NEXT: s_and_b32 s23, s2, 0x20000
+; GFX12-NEXT: s_mul_i32 s8, s3, s22
+; GFX12-NEXT: s_xor_b32 s4, s4, s5
+; GFX12-NEXT: s_and_b32 s24, s2, 0x40000
+; GFX12-NEXT: s_mul_i32 s5, s3, s23
+; GFX12-NEXT: s_xor_b32 s4, s4, s8
+; GFX12-NEXT: s_and_b32 s25, s2, 0x80000
+; GFX12-NEXT: s_mul_i32 s8, s3, s24
+; GFX12-NEXT: s_xor_b32 s4, s4, s5
+; GFX12-NEXT: s_and_b32 s26, s2, 0x100000
+; GFX12-NEXT: s_mul_i32 s5, s3, s25
+; GFX12-NEXT: s_xor_b32 s4, s4, s8
+; GFX12-NEXT: s_and_b32 s27, s2, 0x200000
+; GFX12-NEXT: s_mul_i32 s8, s3, s26
+; GFX12-NEXT: s_xor_b32 s4, s4, s5
+; GFX12-NEXT: s_and_b32 s28, s2, 0x400000
+; GFX12-NEXT: s_mul_i32 s5, s3, s27
+; GFX12-NEXT: s_xor_b32 s4, s4, s8
+; GFX12-NEXT: s_and_b32 s29, s2, 0x800000
+; GFX12-NEXT: s_mul_i32 s8, s3, s28
+; GFX12-NEXT: s_xor_b32 s4, s4, s5
+; GFX12-NEXT: s_and_b32 s30, s2, 0x1000000
+; GFX12-NEXT: s_mul_i32 s5, s3, s29
+; GFX12-NEXT: s_xor_b32 s4, s4, s8
+; GFX12-NEXT: s_and_b32 s31, s2, 0x2000000
+; GFX12-NEXT: s_mul_i32 s8, s3, s30
+; GFX12-NEXT: s_xor_b32 s4, s4, s5
+; GFX12-NEXT: s_and_b32 s33, s2, 0x4000000
+; GFX12-NEXT: s_mul_i32 s5, s3, s31
+; GFX12-NEXT: s_xor_b32 s4, s4, s8
+; GFX12-NEXT: s_and_b32 s34, s2, 0x8000000
+; GFX12-NEXT: s_mul_i32 s8, s3, s33
+; GFX12-NEXT: s_xor_b32 s4, s4, s5
+; GFX12-NEXT: s_and_b32 s35, s2, 0x10000000
+; GFX12-NEXT: s_mul_i32 s5, s3, s34
+; GFX12-NEXT: s_xor_b32 s4, s4, s8
+; GFX12-NEXT: s_and_b32 s36, s2, 0x20000000
+; GFX12-NEXT: s_mul_i32 s8, s3, s35
+; GFX12-NEXT: s_xor_b32 s4, s4, s5
+; GFX12-NEXT: s_and_b32 s37, s2, 2.0
+; GFX12-NEXT: s_mul_i32 s5, s3, s36
+; GFX12-NEXT: s_xor_b32 s4, s4, s8
+; GFX12-NEXT: s_mul_i32 s8, s3, s37
+; GFX12-NEXT: s_xor_b32 s4, s4, s5
+; GFX12-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX12-NEXT: s_xor_b32 s4, s4, s8
+; GFX12-NEXT: s_mul_i32 s3, s3, s2
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_xor_b32 s2, s4, s3
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: test_clmul_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s9, s3
+; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1250-NEXT: s_and_b32 s4, s2, 2
+; GFX1250-NEXT: s_and_b32 s5, s2, 1
+; GFX1250-NEXT: s_and_b32 s8, s2, 4
+; GFX1250-NEXT: s_mul_i32 s4, s3, s4
+; GFX1250-NEXT: s_mul_i32 s5, s3, s5
+; GFX1250-NEXT: s_and_b32 s9, s2, 8
+; GFX1250-NEXT: s_mul_i32 s8, s3, s8
+; GFX1250-NEXT: s_xor_b32 s4, s5, s4
+; GFX1250-NEXT: s_and_b32 s10, s2, 16
+; GFX1250-NEXT: s_mul_i32 s5, s3, s9
+; GFX1250-NEXT: s_xor_b32 s4, s4, s8
+; GFX1250-NEXT: s_and_b32 s11, s2, 32
+; GFX1250-NEXT: s_mul_i32 s8, s3, s10
+; GFX1250-NEXT: s_xor_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s12, s2, 64
+; GFX1250-NEXT: s_mul_i32 s5, s3, s11
+; GFX1250-NEXT: s_xor_b32 s4, s4, s8
+; GFX1250-NEXT: s_and_b32 s13, s2, 0x80
+; GFX1250-NEXT: s_mul_i32 s8, s3, s12
+; GFX1250-NEXT: s_xor_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s14, s2, 0x100
+; GFX1250-NEXT: s_mul_i32 s5, s3, s13
+; GFX1250-NEXT: s_xor_b32 s4, s4, s8
+; GFX1250-NEXT: s_and_b32 s15, s2, 0x200
+; GFX1250-NEXT: s_mul_i32 s8, s3, s14
+; GFX1250-NEXT: s_xor_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s16, s2, 0x400
+; GFX1250-NEXT: s_mul_i32 s5, s3, s15
+; GFX1250-NEXT: s_xor_b32 s4, s4, s8
+; GFX1250-NEXT: s_and_b32 s17, s2, 0x800
+; GFX1250-NEXT: s_mul_i32 s8, s3, s16
+; GFX1250-NEXT: s_xor_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s18, s2, 0x1000
+; GFX1250-NEXT: s_mul_i32 s5, s3, s17
+; GFX1250-NEXT: s_xor_b32 s4, s4, s8
+; GFX1250-NEXT: s_and_b32 s19, s2, 0x2000
+; GFX1250-NEXT: s_mul_i32 s8, s3, s18
+; GFX1250-NEXT: s_xor_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s20, s2, 0x4000
+; GFX1250-NEXT: s_mul_i32 s5, s3, s19
+; GFX1250-NEXT: s_xor_b32 s4, s4, s8
+; GFX1250-NEXT: s_and_b32 s21, s2, 0x8000
+; GFX1250-NEXT: s_mul_i32 s8, s3, s20
+; GFX1250-NEXT: s_xor_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s22, s2, 0x10000
+; GFX1250-NEXT: s_mul_i32 s5, s3, s21
+; GFX1250-NEXT: s_xor_b32 s4, s4, s8
+; GFX1250-NEXT: s_and_b32 s23, s2, 0x20000
+; GFX1250-NEXT: s_mul_i32 s8, s3, s22
+; GFX1250-NEXT: s_xor_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s24, s2, 0x40000
+; GFX1250-NEXT: s_mul_i32 s5, s3, s23
+; GFX1250-NEXT: s_xor_b32 s4, s4, s8
+; GFX1250-NEXT: s_and_b32 s25, s2, 0x80000
+; GFX1250-NEXT: s_mul_i32 s8, s3, s24
+; GFX1250-NEXT: s_xor_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s26, s2, 0x100000
+; GFX1250-NEXT: s_mul_i32 s5, s3, s25
+; GFX1250-NEXT: s_xor_b32 s4, s4, s8
+; GFX1250-NEXT: s_and_b32 s27, s2, 0x200000
+; GFX1250-NEXT: s_mul_i32 s8, s3, s26
+; GFX1250-NEXT: s_xor_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s28, s2, 0x400000
+; GFX1250-NEXT: s_mul_i32 s5, s3, s27
+; GFX1250-NEXT: s_xor_b32 s4, s4, s8
+; GFX1250-NEXT: s_and_b32 s29, s2, 0x800000
+; GFX1250-NEXT: s_mul_i32 s8, s3, s28
+; GFX1250-NEXT: s_xor_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s30, s2, 0x1000000
+; GFX1250-NEXT: s_mul_i32 s5, s3, s29
+; GFX1250-NEXT: s_xor_b32 s4, s4, s8
+; GFX1250-NEXT: s_and_b32 s31, s2, 0x2000000
+; GFX1250-NEXT: s_mul_i32 s8, s3, s30
+; GFX1250-NEXT: s_xor_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s33, s2, 0x4000000
+; GFX1250-NEXT: s_mul_i32 s5, s3, s31
+; GFX1250-NEXT: s_xor_b32 s4, s4, s8
+; GFX1250-NEXT: s_and_b32 s34, s2, 0x8000000
+; GFX1250-NEXT: s_mul_i32 s8, s3, s33
+; GFX1250-NEXT: s_xor_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s35, s2, 0x10000000
+; GFX1250-NEXT: s_mul_i32 s5, s3, s34
+; GFX1250-NEXT: s_xor_b32 s4, s4, s8
+; GFX1250-NEXT: s_and_b32 s36, s2, 0x20000000
+; GFX1250-NEXT: s_mul_i32 s8, s3, s35
+; GFX1250-NEXT: s_xor_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s37, s2, 2.0
+; GFX1250-NEXT: s_mul_i32 s5, s3, s36
+; GFX1250-NEXT: s_xor_b32 s4, s4, s8
+; GFX1250-NEXT: s_mul_i32 s8, s3, s37
+; GFX1250-NEXT: s_xor_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX1250-NEXT: s_xor_b32 s4, s4, s8
+; GFX1250-NEXT: s_mul_i32 s3, s3, s2
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_xor_b32 s2, s4, s3
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX1250-NEXT: s_endpgm
+;
+; EG-LABEL: test_clmul_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 116, @9, KC0[], KC1[]
+; EG-NEXT: ALU 10, @126, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: AND_INT T0.W, T0.Y, 1,
+; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.W,
+; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT: MULLO_INT * T1.X, T0.X, T1.W,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T1.W, T0.Z, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 256(3.587324e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 512(7.174648e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 1024(1.434930e-42), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 2048(2.869859e-42), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 4096(5.739719e-42), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 8192(1.147944e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 16384(2.295887e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 32768(4.591775e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 65536(9.183550e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 131072(1.836710e-40), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 262144(3.673420e-40), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 524288(7.346840e-40), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 1048576(1.469368e-39), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 2097152(2.938736e-39), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 4194304(5.877472e-39), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 8388608(1.175494e-38), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 16777216(2.350989e-38), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 33554432(9.403955e-38), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 67108864(1.504633e-36), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 134217728(3.851860e-34), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 268435456(2.524355e-29), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z,
+; EG-NEXT: 536870912(1.084202e-19), 0(0.000000e+00)
+; EG-NEXT: AND_INT * T2.Z, T0.Y, literal.x,
+; EG-NEXT: 1073741824(2.000000e+00), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 126:
+; EG-NEXT: XOR_INT T0.W, T0.W, T0.Z, BS:VEC_021/SCL_122
+; EG-NEXT: MULLO_INT * T0.Z, T0.X, T1.Z,
+; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.Y, T0.X, T2.Z,
+; EG-NEXT: -2147483648(-0.000000e+00), 0(0.000000e+00)
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.X, PV.Z,
+; EG-NEXT: XOR_INT T0.X, PV.W, PS,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+ %a = load i32, ptr addrspace(1) %in
+ %b = load i32, ptr addrspace(1) %b_ptr
+ %res = call i32 @llvm.clmul.i32(i32 %a, i32 %b)
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_clmulr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; SI-LABEL: test_clmulr_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s6
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s7, 0
+; SI-NEXT: s_mov_b32 s21, s7
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: s_mov_b32 s15, s7
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: s_mov_b32 s17, s7
+; SI-NEXT: s_mov_b32 s19, s7
+; SI-NEXT: s_mov_b32 s23, s7
+; SI-NEXT: s_mov_b32 s25, s7
+; SI-NEXT: s_mov_b32 s27, s7
+; SI-NEXT: s_mov_b32 s29, s7
+; SI-NEXT: s_mov_b32 s31, s7
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s37, s7
+; SI-NEXT: s_mov_b32 s39, s7
+; SI-NEXT: s_mov_b32 s41, s7
+; SI-NEXT: s_mov_b32 s43, s7
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_readfirstlane_b32 s33, v1
+; SI-NEXT: s_and_b32 s20, s33, 2
+; SI-NEXT: v_readfirstlane_b32 s6, v0
+; SI-NEXT: s_bfe_i32 s8, s33, 0x10000
+; SI-NEXT: v_cmp_eq_u64_e64 s[20:21], s[20:21], 0
+; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
+; SI-NEXT: s_and_b32 s8, s8, s6
+; SI-NEXT: s_and_b64 s[20:21], s[20:21], exec
+; SI-NEXT: s_cselect_b32 s21, 0, s5
+; SI-NEXT: s_cselect_b32 s20, 0, s4
+; SI-NEXT: s_and_b32 s14, s33, 4
+; SI-NEXT: s_xor_b64 s[20:21], s[8:9], s[20:21]
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[14:15], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 2
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s15, 0, s15
+; SI-NEXT: s_cselect_b32 s14, 0, s14
+; SI-NEXT: s_and_b32 s10, s33, 8
+; SI-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], 0
+; SI-NEXT: s_xor_b64 s[14:15], s[20:21], s[14:15]
+; SI-NEXT: s_lshl_b64 s[20:21], s[6:7], 3
+; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec
+; SI-NEXT: s_cselect_b32 s11, 0, s21
+; SI-NEXT: s_cselect_b32 s10, 0, s20
+; SI-NEXT: s_and_b32 s12, s33, 16
+; SI-NEXT: v_cmp_eq_u64_e64 s[12:13], s[12:13], 0
+; SI-NEXT: s_lshl_b64 s[20:21], s[6:7], 4
+; SI-NEXT: s_xor_b64 s[10:11], s[14:15], s[10:11]
+; SI-NEXT: s_and_b64 s[12:13], s[12:13], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s21
+; SI-NEXT: s_cselect_b32 s12, 0, s20
+; SI-NEXT: s_and_b32 s16, s33, 32
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[16:17], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 5
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s18, s33, 64
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[18:19], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 6
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s22, s33, 0x80
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[22:23], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 7
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s24, s33, 0x100
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[24:25], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 8
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s26, s33, 0x200
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[26:27], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 9
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s28, s33, 0x400
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[28:29], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 10
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s30, s33, 0x800
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[30:31], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 11
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s34, s33, 0x1000
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[34:35], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 12
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s36, s33, 0x2000
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[36:37], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 13
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s38, s33, 0x4000
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[38:39], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 14
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s40, s33, 0x8000
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[40:41], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 15
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s42, s33, 0x10000
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[42:43], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 16
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s4, s33, 0x20000
+; SI-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 17
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: s_cselect_b32 s5, 0, s15
+; SI-NEXT: s_cselect_b32 s4, 0, s14
+; SI-NEXT: s_and_b32 s8, s33, 0x40000
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[12:13], s[6:7], 18
+; SI-NEXT: s_xor_b64 s[4:5], s[10:11], s[4:5]
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s13
+; SI-NEXT: s_cselect_b32 s8, 0, s12
+; SI-NEXT: s_and_b32 s10, s33, 0x80000
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[10:11], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 19
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x100000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 20
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x200000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 21
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x400000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 22
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x800000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 23
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x1000000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 24
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x2000000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 25
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x4000000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 26
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x8000000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 27
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x10000000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 28
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x20000000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 29
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 2.0
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 30
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 31
+; SI-NEXT: s_cmp_gt_i32 s33, -1
+; SI-NEXT: s_cselect_b32 s7, 0, s7
+; SI-NEXT: s_cselect_b32 s6, 0, s6
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
+; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 31
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: test_clmulr_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
+; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s7, 0
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_mov_b32 s9, s7
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s7
+; VI-NEXT: s_mov_b32 s17, s7
+; VI-NEXT: s_mov_b32 s19, s7
+; VI-NEXT: s_mov_b32 s21, s7
+; VI-NEXT: s_mov_b32 s23, s7
+; VI-NEXT: s_mov_b32 s25, s7
+; VI-NEXT: s_mov_b32 s27, s7
+; VI-NEXT: s_mov_b32 s29, s7
+; VI-NEXT: s_mov_b32 s31, s7
+; VI-NEXT: s_mov_b32 s35, s7
+; VI-NEXT: s_mov_b32 s37, s7
+; VI-NEXT: s_mov_b32 s39, s7
+; VI-NEXT: s_mov_b32 s41, s7
+; VI-NEXT: s_mov_b32 s43, s7
+; VI-NEXT: s_mov_b32 s45, s7
+; VI-NEXT: s_mov_b32 s47, s7
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_readfirstlane_b32 s4, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v0
+; VI-NEXT: s_bfe_i32 s5, s4, 0x10000
+; VI-NEXT: s_lshl_b64 s[48:49], s[6:7], 1
+; VI-NEXT: s_and_b32 s10, s4, 2
+; VI-NEXT: s_and_b32 s8, s5, s6
+; VI-NEXT: s_cmp_eq_u64 s[10:11], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s49
+; VI-NEXT: s_cselect_b32 s10, 0, s48
+; VI-NEXT: s_lshl_b64 s[48:49], s[6:7], 2
+; VI-NEXT: s_and_b32 s12, s4, 4
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s49
+; VI-NEXT: s_cselect_b32 s10, 0, s48
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 3
+; VI-NEXT: s_and_b32 s14, s4, 8
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[14:15], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 4
+; VI-NEXT: s_and_b32 s16, s4, 16
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[16:17], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 5
+; VI-NEXT: s_and_b32 s18, s4, 32
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[18:19], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 6
+; VI-NEXT: s_and_b32 s20, s4, 64
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[20:21], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 7
+; VI-NEXT: s_and_b32 s22, s4, 0x80
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[22:23], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 8
+; VI-NEXT: s_and_b32 s24, s4, 0x100
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[24:25], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 9
+; VI-NEXT: s_and_b32 s26, s4, 0x200
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[26:27], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 10
+; VI-NEXT: s_and_b32 s28, s4, 0x400
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[28:29], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 11
+; VI-NEXT: s_and_b32 s30, s4, 0x800
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[30:31], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 12
+; VI-NEXT: s_and_b32 s34, s4, 0x1000
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[34:35], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 13
+; VI-NEXT: s_and_b32 s36, s4, 0x2000
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[36:37], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 14
+; VI-NEXT: s_and_b32 s38, s4, 0x4000
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[38:39], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 15
+; VI-NEXT: s_and_b32 s40, s4, 0x8000
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[40:41], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 16
+; VI-NEXT: s_and_b32 s42, s4, 0x10000
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[42:43], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 17
+; VI-NEXT: s_and_b32 s44, s4, 0x20000
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[44:45], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 18
+; VI-NEXT: s_and_b32 s46, s4, 0x40000
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[46:47], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 19
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_and_b32 s10, s4, 0x80000
+; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_cmp_eq_u64 s[10:11], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 20
+; VI-NEXT: s_and_b32 s12, s4, 0x100000
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 21
+; VI-NEXT: s_and_b32 s12, s4, 0x200000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 22
+; VI-NEXT: s_and_b32 s12, s4, 0x400000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 23
+; VI-NEXT: s_and_b32 s12, s4, 0x800000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 24
+; VI-NEXT: s_and_b32 s12, s4, 0x1000000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 25
+; VI-NEXT: s_and_b32 s12, s4, 0x2000000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 26
+; VI-NEXT: s_and_b32 s12, s4, 0x4000000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 27
+; VI-NEXT: s_and_b32 s12, s4, 0x8000000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 28
+; VI-NEXT: s_and_b32 s12, s4, 0x10000000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 29
+; VI-NEXT: s_and_b32 s12, s4, 0x20000000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 30
+; VI-NEXT: s_and_b32 s12, s4, 2.0
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 31
+; VI-NEXT: s_cmp_gt_i32 s4, -1
+; VI-NEXT: s_cselect_b32 s5, 0, s7
+; VI-NEXT: s_cselect_b32 s4, 0, s6
+; VI-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
+; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 31
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_clmulr_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s6, s2
+; GFX9-NEXT: s_mov_b32 s7, s3
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, s10
+; GFX9-NEXT: s_mov_b32 s5, s11
+; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: s_mov_b32 s0, s8
+; GFX9-NEXT: s_mov_b32 s5, 0
+; GFX9-NEXT: s_mov_b32 s11, s5
+; GFX9-NEXT: s_mov_b32 s7, s5
+; GFX9-NEXT: s_mov_b32 s13, s5
+; GFX9-NEXT: s_mov_b32 s15, s5
+; GFX9-NEXT: s_mov_b32 s17, s5
+; GFX9-NEXT: s_mov_b32 s19, s5
+; GFX9-NEXT: s_mov_b32 s21, s5
+; GFX9-NEXT: s_mov_b32 s23, s5
+; GFX9-NEXT: s_mov_b32 s25, s5
+; GFX9-NEXT: s_mov_b32 s27, s5
+; GFX9-NEXT: s_mov_b32 s29, s5
+; GFX9-NEXT: s_mov_b32 s31, s5
+; GFX9-NEXT: s_mov_b32 s35, s5
+; GFX9-NEXT: s_mov_b32 s37, s5
+; GFX9-NEXT: s_mov_b32 s39, s5
+; GFX9-NEXT: s_mov_b32 s41, s5
+; GFX9-NEXT: s_mov_b32 s43, s5
+; GFX9-NEXT: s_mov_b32 s45, s5
+; GFX9-NEXT: s_mov_b32 s47, s5
+; GFX9-NEXT: s_mov_b32 s1, s9
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_bfe_i32 s6, s8, 0x10000
+; GFX9-NEXT: s_lshl_b64 s[48:49], s[4:5], 1
+; GFX9-NEXT: s_and_b32 s10, s8, 2
+; GFX9-NEXT: s_and_b32 s6, s6, s4
+; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s49
+; GFX9-NEXT: s_cselect_b32 s10, 0, s48
+; GFX9-NEXT: s_lshl_b64 s[48:49], s[4:5], 2
+; GFX9-NEXT: s_and_b32 s12, s8, 4
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s49
+; GFX9-NEXT: s_cselect_b32 s10, 0, s48
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 3
+; GFX9-NEXT: s_and_b32 s14, s8, 8
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[14:15], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 4
+; GFX9-NEXT: s_and_b32 s16, s8, 16
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 5
+; GFX9-NEXT: s_and_b32 s18, s8, 32
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[18:19], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 6
+; GFX9-NEXT: s_and_b32 s20, s8, 64
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[20:21], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 7
+; GFX9-NEXT: s_and_b32 s22, s8, 0x80
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[22:23], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 8
+; GFX9-NEXT: s_and_b32 s24, s8, 0x100
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[24:25], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 9
+; GFX9-NEXT: s_and_b32 s26, s8, 0x200
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[26:27], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 10
+; GFX9-NEXT: s_and_b32 s28, s8, 0x400
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[28:29], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 11
+; GFX9-NEXT: s_and_b32 s30, s8, 0x800
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[30:31], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 12
+; GFX9-NEXT: s_and_b32 s34, s8, 0x1000
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[34:35], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 13
+; GFX9-NEXT: s_and_b32 s36, s8, 0x2000
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[36:37], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 14
+; GFX9-NEXT: s_and_b32 s38, s8, 0x4000
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[38:39], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 15
+; GFX9-NEXT: s_and_b32 s40, s8, 0x8000
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[40:41], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 16
+; GFX9-NEXT: s_and_b32 s42, s8, 0x10000
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[42:43], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 17
+; GFX9-NEXT: s_and_b32 s44, s8, 0x20000
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[44:45], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 18
+; GFX9-NEXT: s_and_b32 s46, s8, 0x40000
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[46:47], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 19
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_and_b32 s10, s8, 0x80000
+; GFX9-NEXT: s_mov_b32 s11, s5
+; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 20
+; GFX9-NEXT: s_and_b32 s12, s8, 0x100000
+; GFX9-NEXT: s_mov_b32 s13, s5
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 21
+; GFX9-NEXT: s_and_b32 s12, s8, 0x200000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 22
+; GFX9-NEXT: s_and_b32 s12, s8, 0x400000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 23
+; GFX9-NEXT: s_and_b32 s12, s8, 0x800000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 24
+; GFX9-NEXT: s_and_b32 s12, s8, 0x1000000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 25
+; GFX9-NEXT: s_and_b32 s12, s8, 0x2000000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 26
+; GFX9-NEXT: s_and_b32 s12, s8, 0x4000000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 27
+; GFX9-NEXT: s_and_b32 s12, s8, 0x8000000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 28
+; GFX9-NEXT: s_and_b32 s12, s8, 0x10000000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 29
+; GFX9-NEXT: s_and_b32 s12, s8, 0x20000000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 30
+; GFX9-NEXT: s_and_b32 s12, s8, 2.0
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 31
+; GFX9-NEXT: s_cmp_gt_i32 s8, -1
+; GFX9-NEXT: s_cselect_b32 s5, 0, s5
+; GFX9-NEXT: s_cselect_b32 s4, 0, s4
+; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5]
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 31
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: test_clmulr_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_mov_b32 s6, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s6
+; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s8, s2
+; GFX10-NEXT: s_mov_b32 s9, s3
+; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX10-NEXT: s_mov_b32 s11, s3
+; GFX10-NEXT: s_mov_b32 s9, s3
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10-NEXT: s_bfe_i32 s5, s4, 0x10000
+; GFX10-NEXT: s_and_b32 s10, s4, 2
+; GFX10-NEXT: s_lshl_b64 s[12:13], s[2:3], 1
+; GFX10-NEXT: s_and_b32 s8, s5, s2
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_cselect_b32 s13, 0, s13
+; GFX10-NEXT: s_cselect_b32 s12, 0, s12
+; GFX10-NEXT: s_and_b32 s10, s4, 4
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 2
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 8
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 3
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 16
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 4
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 32
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 5
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 64
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 6
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x80
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 7
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x100
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 8
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x200
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 9
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x400
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 10
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x800
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 11
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x1000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 12
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x2000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 13
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x4000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 14
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x8000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 15
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x10000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 16
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x20000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 17
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x40000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 18
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x80000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 19
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x100000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 20
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x200000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 21
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x400000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 22
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x800000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 23
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x1000000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 24
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x2000000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 25
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x4000000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 26
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x8000000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 27
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x10000000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 28
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x20000000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 29
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 2.0
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 30
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s11, 0, s15
+; GFX10-NEXT: s_cselect_b32 s10, 0, s14
+; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; GFX10-NEXT: s_cmp_gt_i32 s4, -1
+; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_cselect_b32 s3, 0, s3
+; GFX10-NEXT: s_cselect_b32 s2, 0, s2
+; GFX10-NEXT: s_xor_b64 s[2:3], s[8:9], s[2:3]
+; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 31
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_clmulr_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s6
+; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s2
+; GFX11-NEXT: s_mov_b32 s9, s3
+; GFX11-NEXT: s_mov_b32 s3, 0
+; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s9, s3
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-NEXT: s_bfe_i32 s5, s4, 0x10000
+; GFX11-NEXT: s_and_b32 s10, s4, 2
+; GFX11-NEXT: s_lshl_b64 s[12:13], s[2:3], 1
+; GFX11-NEXT: s_and_b32 s8, s5, s2
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_cselect_b32 s13, 0, s13
+; GFX11-NEXT: s_cselect_b32 s12, 0, s12
+; GFX11-NEXT: s_and_b32 s10, s4, 4
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 2
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 8
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 3
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 16
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 4
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 32
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 5
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 64
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 6
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x80
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 7
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x100
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 8
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x200
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 9
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x400
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 10
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x800
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 11
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x1000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 12
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x2000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 13
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x4000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 14
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x8000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 15
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x10000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 16
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x20000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 17
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x40000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 18
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x80000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 19
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x100000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 20
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x200000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 21
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x400000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 22
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x800000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 23
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x1000000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 24
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x2000000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 25
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x4000000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 26
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x8000000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 27
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x10000000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 28
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x20000000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 29
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 2.0
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 30
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s11, 0, s15
+; GFX11-NEXT: s_cselect_b32 s10, 0, s14
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; GFX11-NEXT: s_cmp_gt_i32 s4, -1
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_cselect_b32 s3, 0, s3
+; GFX11-NEXT: s_cselect_b32 s2, 0, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b64 s[2:3], s[8:9], s[2:3]
+; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_clmulr_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s2
+; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s3, 0
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT: s_mov_b32 s5, s3
+; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s11, s3
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s15, s3
+; GFX12-NEXT: s_mov_b32 s17, s3
+; GFX12-NEXT: s_mov_b32 s19, s3
+; GFX12-NEXT: s_mov_b32 s21, s3
+; GFX12-NEXT: s_mov_b32 s23, s3
+; GFX12-NEXT: s_mov_b32 s25, s3
+; GFX12-NEXT: s_mov_b32 s27, s3
+; GFX12-NEXT: s_mov_b32 s29, s3
+; GFX12-NEXT: s_mov_b32 s31, s3
+; GFX12-NEXT: s_mov_b32 s35, s3
+; GFX12-NEXT: s_mov_b32 s37, s3
+; GFX12-NEXT: s_mov_b32 s39, s3
+; GFX12-NEXT: s_mov_b32 s41, s3
+; GFX12-NEXT: s_mov_b32 s43, s3
+; GFX12-NEXT: s_mov_b32 s45, s3
+; GFX12-NEXT: s_mov_b32 s47, s3
+; GFX12-NEXT: s_mov_b32 s49, s3
+; GFX12-NEXT: s_mov_b32 s51, s3
+; GFX12-NEXT: s_mov_b32 s53, s3
+; GFX12-NEXT: s_mov_b32 s55, s3
+; GFX12-NEXT: s_mov_b32 s57, s3
+; GFX12-NEXT: s_mov_b32 s59, s3
+; GFX12-NEXT: s_mov_b32 s61, s3
+; GFX12-NEXT: s_mov_b32 s63, s3
+; GFX12-NEXT: s_mov_b32 s65, s3
+; GFX12-NEXT: s_mov_b32 s67, s3
+; GFX12-NEXT: s_mov_b32 s69, s3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s33, v1
+; GFX12-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-NEXT: s_and_b32 s4, s33, 2
+; GFX12-NEXT: s_and_b32 s8, s33, 1
+; GFX12-NEXT: s_and_b32 s10, s33, 4
+; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], s[4:5]
+; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[8:9]
+; GFX12-NEXT: s_and_b32 s12, s33, 8
+; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[10:11]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
+; GFX12-NEXT: s_and_b32 s14, s33, 16
+; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[12:13]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_and_b32 s16, s33, 32
+; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[14:15]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: s_and_b32 s18, s33, 64
+; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[16:17]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
+; GFX12-NEXT: s_and_b32 s20, s33, 0x80
+; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[18:19]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX12-NEXT: s_and_b32 s22, s33, 0x100
+; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[20:21]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_and_b32 s24, s33, 0x200
+; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[22:23]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: s_and_b32 s26, s33, 0x400
+; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[24:25]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
+; GFX12-NEXT: s_and_b32 s28, s33, 0x800
+; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[26:27]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX12-NEXT: s_and_b32 s30, s33, 0x1000
+; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[28:29]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_and_b32 s34, s33, 0x2000
+; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[30:31]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: s_and_b32 s36, s33, 0x4000
+; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[34:35]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
+; GFX12-NEXT: s_and_b32 s38, s33, 0x8000
+; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[36:37]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX12-NEXT: s_and_b32 s40, s33, 0x10000
+; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[38:39]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_and_b32 s42, s33, 0x20000
+; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[40:41]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: s_and_b32 s44, s33, 0x40000
+; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[42:43]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
+; GFX12-NEXT: s_and_b32 s46, s33, 0x80000
+; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[44:45]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX12-NEXT: s_and_b32 s48, s33, 0x100000
+; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[46:47]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_and_b32 s50, s33, 0x200000
+; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[48:49]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: s_and_b32 s52, s33, 0x400000
+; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[50:51]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
+; GFX12-NEXT: s_and_b32 s54, s33, 0x800000
+; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[52:53]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX12-NEXT: s_and_b32 s56, s33, 0x1000000
+; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[54:55]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_and_b32 s58, s33, 0x2000000
+; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[56:57]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: s_and_b32 s60, s33, 0x4000000
+; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[58:59]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
+; GFX12-NEXT: s_and_b32 s62, s33, 0x8000000
+; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[60:61]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX12-NEXT: s_and_b32 s64, s33, 0x10000000
+; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[62:63]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_and_b32 s66, s33, 0x20000000
+; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[64:65]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: s_and_b32 s68, s33, 2.0
+; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[66:67]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
+; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[68:69]
+; GFX12-NEXT: s_and_b32 s12, s33, 0x80000000
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[12:13]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX12-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], 31
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: test_clmulr_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s9, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null
+; GFX1250-NEXT: s_mov_b32 s5, s3
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b64 s[8:9], 0x80000000
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1250-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1250-NEXT: s_and_b64 s[10:11], s[4:5], 2
+; GFX1250-NEXT: s_and_b64 s[12:13], s[4:5], 1
+; GFX1250-NEXT: s_and_b64 s[14:15], s[4:5], 4
+; GFX1250-NEXT: s_mul_u64 s[10:11], s[2:3], s[10:11]
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[16:17], s[4:5], 8
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[14:15]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[12:13], s[10:11]
+; GFX1250-NEXT: s_and_b64 s[18:19], s[4:5], 16
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[16:17]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[20:21], s[4:5], 32
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[18:19]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[22:23], s[4:5], 64
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[20:21]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[24:25], s[4:5], 0x80
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[22:23]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[26:27], s[4:5], 0x100
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[24:25]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[28:29], s[4:5], 0x200
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[26:27]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[30:31], s[4:5], 0x400
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[28:29]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[34:35], s[4:5], 0x800
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[30:31]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[36:37], s[4:5], 0x1000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[34:35]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[38:39], s[4:5], 0x2000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[36:37]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[40:41], s[4:5], 0x4000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[38:39]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[42:43], s[4:5], 0x8000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[40:41]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[44:45], s[4:5], 0x10000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[42:43]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[46:47], s[4:5], 0x20000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[44:45]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[48:49], s[4:5], 0x40000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[46:47]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[50:51], s[4:5], 0x80000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[48:49]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[52:53], s[4:5], 0x100000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[50:51]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[54:55], s[4:5], 0x200000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[52:53]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[56:57], s[4:5], 0x400000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[54:55]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[58:59], s[4:5], 0x800000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[56:57]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[60:61], s[4:5], 0x1000000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[58:59]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[62:63], s[4:5], 0x2000000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[60:61]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[64:65], s[4:5], 0x4000000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[62:63]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[66:67], s[4:5], 0x8000000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[64:65]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[68:69], s[4:5], 0x10000000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[66:67]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[70:71], s[4:5], 0x20000000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[68:69]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[72:73], s[4:5], 0x40000000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[70:71]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[72:73]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX1250-NEXT: s_xor_b64 s[8:9], s[10:11], s[14:15]
+; GFX1250-NEXT: s_mul_u64 s[2:3], s[2:3], s[4:5]
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: s_xor_b64 s[2:3], s[8:9], s[2:3]
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_lshr_b64 s[2:3], s[2:3], 31
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX1250-NEXT: s_endpgm
+;
+; EG-LABEL: test_clmulr_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @8
+; EG-NEXT: ALU 98, @11, KC0[], KC1[]
+; EG-NEXT: ALU 110, @110, KC0[], KC1[]
+; EG-NEXT: ALU 12, @221, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 8:
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: LSHR * T0.W, T0.X, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, PV.W, literal.x,
+; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT: LSHR * T1.W, T0.Y, literal.y,
+; EG-NEXT: 65280(9.147676e-41), 8(1.121039e-44)
+; EG-NEXT: AND_INT T1.X, PS, literal.x,
+; EG-NEXT: LSHR T1.Y, T0.Y, literal.y,
+; EG-NEXT: LSHL T1.Z, PV.W, literal.z,
+; EG-NEXT: LSHL T0.W, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: 65280(9.147676e-41), 24(3.363116e-44)
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.X, T0.X, literal.x,
+; EG-NEXT: LSHL T0.Y, PS, literal.y,
+; EG-NEXT: LSHL T2.Z, T0.X, literal.x,
+; EG-NEXT: OR_INT T0.W, PV.W, PV.Z,
+; EG-NEXT: OR_INT * T1.W, PV.X, PV.Y,
+; EG-NEXT: 24(3.363116e-44), 8(1.121039e-44)
+; EG-NEXT: OR_INT T1.Z, PV.W, PS,
+; EG-NEXT: OR_INT T0.W, PV.Z, PV.Y,
+; EG-NEXT: OR_INT * T1.W, T0.Z, PV.X,
+; EG-NEXT: OR_INT T0.Z, PV.W, PS,
+; EG-NEXT: AND_INT T0.W, PV.Z, literal.x,
+; EG-NEXT: LSHR * T1.W, PV.Z, literal.y,
+; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45)
+; EG-NEXT: AND_INT T0.Y, PS, literal.x,
+; EG-NEXT: LSHL T1.Z, PV.W, literal.y,
+; EG-NEXT: AND_INT T0.W, PV.Z, literal.x,
+; EG-NEXT: LSHR * T1.W, PV.Z, literal.y,
+; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45)
+; EG-NEXT: AND_INT T0.Z, PS, literal.x,
+; EG-NEXT: LSHL T0.W, PV.W, literal.y,
+; EG-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45)
+; EG-NEXT: AND_INT T1.Z, PS, literal.x,
+; EG-NEXT: LSHR T1.W, PS, literal.y,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45)
+; EG-NEXT: AND_INT T0.Y, PS, literal.x,
+; EG-NEXT: LSHR T0.Z, PS, literal.y,
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.y,
+; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45)
+; EG-NEXT: OR_INT T1.Z, PV.W, PS,
+; EG-NEXT: AND_INT T0.W, PV.Z, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.Y, literal.y,
+; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45)
+; EG-NEXT: OR_INT T0.Z, PV.W, PS,
+; EG-NEXT: AND_INT T0.W, PV.Z, literal.x,
+; EG-NEXT: LSHR * T1.W, PV.Z, 1,
+; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Y, PS, literal.x,
+; EG-NEXT: LSHL T1.Z, PV.W, 1,
+; EG-NEXT: AND_INT T0.W, PV.Z, literal.x,
+; EG-NEXT: LSHR * T1.W, PV.Z, 1,
+; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, PS, literal.x,
+; EG-NEXT: LSHL T0.W, PV.W, 1,
+; EG-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.W, PS, 1,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: AND_INT T3.W, T1.W, literal.x,
+; EG-NEXT: MULLO_INT * T0.X, PS, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.W, T1.W, literal.x,
+; EG-NEXT: MULLO_INT * T0.Y, T0.W, PV.W,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, T0.X, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T2.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T2.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T2.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T2.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T2.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 256(3.587324e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T2.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 512(7.174648e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT * T1.Z, T1.W, literal.x,
+; EG-NEXT: 1024(1.434930e-42), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 110:
+; EG-NEXT: XOR_INT T3.W, T2.W, T0.X,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, T0.Z,
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, T1.Z,
+; EG-NEXT: 2048(2.869859e-42), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 4096(5.739719e-42), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 8192(1.147944e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 16384(2.295887e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 32768(4.591775e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 65536(9.183550e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 131072(1.836710e-40), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 262144(3.673420e-40), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 524288(7.346840e-40), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 1048576(1.469368e-39), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 2097152(2.938736e-39), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 4194304(5.877472e-39), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 8388608(1.175494e-38), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 16777216(2.350989e-38), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 33554432(9.403955e-38), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T4.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 67108864(1.504633e-36), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T4.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 134217728(3.851860e-34), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T4.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 268435456(2.524355e-29), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T4.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 536870912(1.084202e-19), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T4.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 1073741824(2.000000e+00), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T1.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: -2147483648(-0.000000e+00), 0(0.000000e+00)
+; EG-NEXT: XOR_INT T1.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: LSHR T0.Z, T3.W, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: AND_INT * T1.W, T3.W, literal.y,
+; EG-NEXT: 8(1.121039e-44), 65280(9.147676e-41)
+; EG-NEXT: LSHL T0.Y, PS, literal.x,
+; EG-NEXT: LSHL T1.Z, T2.W, literal.y,
+; EG-NEXT: LSHR T0.W, PV.W, literal.y,
+; EG-NEXT: AND_INT * T1.W, PV.Z, literal.z,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: 65280(9.147676e-41), 0(0.000000e+00)
+; EG-NEXT: OR_INT T0.W, PS, PV.W,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.Y,
+; EG-NEXT: OR_INT * T0.W, PS, PV.W,
+; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
+; EG-NEXT: LSHR * T0.W, PV.W, literal.y,
+; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45)
+; EG-NEXT: AND_INT T0.W, PS, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.W, literal.y,
+; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
+; EG-NEXT: LSHR * T0.W, PV.W, literal.y,
+; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45)
+; EG-NEXT: ALU clause starting at 221:
+; EG-NEXT: AND_INT T0.W, T0.W, literal.x,
+; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
+; EG-NEXT: LSHR * T0.W, PV.W, 1,
+; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PS, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.W, 1,
+; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00)
+; EG-NEXT: OR_INT T0.X, PV.W, PS,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+ %a = load i32, ptr addrspace(1) %in
+ %b = load i32, ptr addrspace(1) %b_ptr
+ %a.ext = zext i32 %a to i64
+ %b.ext = zext i32 %b to i64
+ %clmul = call i64 @llvm.clmul.i64(i64 %a.ext, i64 %b.ext)
+ %res.ext = lshr i64 %clmul, 31
+ %res = trunc i64 %res.ext to i32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_clmulh_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; SI-LABEL: test_clmulh_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s6
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s7, 0
+; SI-NEXT: s_mov_b32 s21, s7
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: s_mov_b32 s15, s7
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: s_mov_b32 s17, s7
+; SI-NEXT: s_mov_b32 s19, s7
+; SI-NEXT: s_mov_b32 s23, s7
+; SI-NEXT: s_mov_b32 s25, s7
+; SI-NEXT: s_mov_b32 s27, s7
+; SI-NEXT: s_mov_b32 s29, s7
+; SI-NEXT: s_mov_b32 s31, s7
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s37, s7
+; SI-NEXT: s_mov_b32 s39, s7
+; SI-NEXT: s_mov_b32 s41, s7
+; SI-NEXT: s_mov_b32 s43, s7
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_readfirstlane_b32 s33, v1
+; SI-NEXT: s_and_b32 s20, s33, 2
+; SI-NEXT: v_readfirstlane_b32 s6, v0
+; SI-NEXT: s_bfe_i32 s8, s33, 0x10000
+; SI-NEXT: v_cmp_eq_u64_e64 s[20:21], s[20:21], 0
+; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
+; SI-NEXT: s_and_b32 s8, s8, s6
+; SI-NEXT: s_and_b64 s[20:21], s[20:21], exec
+; SI-NEXT: s_cselect_b32 s21, 0, s5
+; SI-NEXT: s_cselect_b32 s20, 0, s4
+; SI-NEXT: s_and_b32 s14, s33, 4
+; SI-NEXT: s_xor_b64 s[20:21], s[8:9], s[20:21]
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[14:15], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 2
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s15, 0, s15
+; SI-NEXT: s_cselect_b32 s14, 0, s14
+; SI-NEXT: s_and_b32 s10, s33, 8
+; SI-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], 0
+; SI-NEXT: s_xor_b64 s[14:15], s[20:21], s[14:15]
+; SI-NEXT: s_lshl_b64 s[20:21], s[6:7], 3
+; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec
+; SI-NEXT: s_cselect_b32 s11, 0, s21
+; SI-NEXT: s_cselect_b32 s10, 0, s20
+; SI-NEXT: s_and_b32 s12, s33, 16
+; SI-NEXT: v_cmp_eq_u64_e64 s[12:13], s[12:13], 0
+; SI-NEXT: s_lshl_b64 s[20:21], s[6:7], 4
+; SI-NEXT: s_xor_b64 s[10:11], s[14:15], s[10:11]
+; SI-NEXT: s_and_b64 s[12:13], s[12:13], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s21
+; SI-NEXT: s_cselect_b32 s12, 0, s20
+; SI-NEXT: s_and_b32 s16, s33, 32
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[16:17], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 5
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s18, s33, 64
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[18:19], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 6
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s22, s33, 0x80
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[22:23], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 7
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s24, s33, 0x100
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[24:25], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 8
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s26, s33, 0x200
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[26:27], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 9
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s28, s33, 0x400
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[28:29], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 10
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s30, s33, 0x800
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[30:31], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 11
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s34, s33, 0x1000
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[34:35], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 12
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s36, s33, 0x2000
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[36:37], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 13
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s38, s33, 0x4000
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[38:39], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 14
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s40, s33, 0x8000
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[40:41], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 15
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s42, s33, 0x10000
+; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[42:43], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 16
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_cselect_b32 s13, 0, s15
+; SI-NEXT: s_cselect_b32 s12, 0, s14
+; SI-NEXT: s_and_b32 s4, s33, 0x20000
+; SI-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], 0
+; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 17
+; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: s_cselect_b32 s5, 0, s15
+; SI-NEXT: s_cselect_b32 s4, 0, s14
+; SI-NEXT: s_and_b32 s8, s33, 0x40000
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[12:13], s[6:7], 18
+; SI-NEXT: s_xor_b64 s[4:5], s[10:11], s[4:5]
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s13
+; SI-NEXT: s_cselect_b32 s8, 0, s12
+; SI-NEXT: s_and_b32 s10, s33, 0x80000
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[10:11], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 19
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x100000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 20
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x200000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 21
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x400000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 22
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x800000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 23
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x1000000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 24
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x2000000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 25
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x4000000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 26
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x8000000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 27
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x10000000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 28
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 0x20000000
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 29
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_and_b32 s8, s33, 2.0
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0
+; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 30
+; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT: s_cselect_b32 s9, 0, s11
+; SI-NEXT: s_cselect_b32 s8, 0, s10
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 31
+; SI-NEXT: s_cmp_gt_i32 s33, -1
+; SI-NEXT: s_cselect_b32 s7, 0, s7
+; SI-NEXT: s_cselect_b32 s6, 0, s6
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: test_clmulh_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
+; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s7, 0
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_mov_b32 s9, s7
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s7
+; VI-NEXT: s_mov_b32 s17, s7
+; VI-NEXT: s_mov_b32 s19, s7
+; VI-NEXT: s_mov_b32 s21, s7
+; VI-NEXT: s_mov_b32 s23, s7
+; VI-NEXT: s_mov_b32 s25, s7
+; VI-NEXT: s_mov_b32 s27, s7
+; VI-NEXT: s_mov_b32 s29, s7
+; VI-NEXT: s_mov_b32 s31, s7
+; VI-NEXT: s_mov_b32 s35, s7
+; VI-NEXT: s_mov_b32 s37, s7
+; VI-NEXT: s_mov_b32 s39, s7
+; VI-NEXT: s_mov_b32 s41, s7
+; VI-NEXT: s_mov_b32 s43, s7
+; VI-NEXT: s_mov_b32 s45, s7
+; VI-NEXT: s_mov_b32 s47, s7
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_readfirstlane_b32 s4, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v0
+; VI-NEXT: s_bfe_i32 s5, s4, 0x10000
+; VI-NEXT: s_lshl_b64 s[48:49], s[6:7], 1
+; VI-NEXT: s_and_b32 s10, s4, 2
+; VI-NEXT: s_and_b32 s8, s5, s6
+; VI-NEXT: s_cmp_eq_u64 s[10:11], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s49
+; VI-NEXT: s_cselect_b32 s10, 0, s48
+; VI-NEXT: s_lshl_b64 s[48:49], s[6:7], 2
+; VI-NEXT: s_and_b32 s12, s4, 4
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s49
+; VI-NEXT: s_cselect_b32 s10, 0, s48
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 3
+; VI-NEXT: s_and_b32 s14, s4, 8
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[14:15], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 4
+; VI-NEXT: s_and_b32 s16, s4, 16
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[16:17], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 5
+; VI-NEXT: s_and_b32 s18, s4, 32
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[18:19], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 6
+; VI-NEXT: s_and_b32 s20, s4, 64
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[20:21], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 7
+; VI-NEXT: s_and_b32 s22, s4, 0x80
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[22:23], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 8
+; VI-NEXT: s_and_b32 s24, s4, 0x100
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[24:25], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 9
+; VI-NEXT: s_and_b32 s26, s4, 0x200
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[26:27], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 10
+; VI-NEXT: s_and_b32 s28, s4, 0x400
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[28:29], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 11
+; VI-NEXT: s_and_b32 s30, s4, 0x800
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[30:31], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 12
+; VI-NEXT: s_and_b32 s34, s4, 0x1000
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[34:35], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 13
+; VI-NEXT: s_and_b32 s36, s4, 0x2000
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[36:37], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 14
+; VI-NEXT: s_and_b32 s38, s4, 0x4000
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[38:39], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 15
+; VI-NEXT: s_and_b32 s40, s4, 0x8000
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[40:41], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 16
+; VI-NEXT: s_and_b32 s42, s4, 0x10000
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[42:43], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 17
+; VI-NEXT: s_and_b32 s44, s4, 0x20000
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[44:45], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 18
+; VI-NEXT: s_and_b32 s46, s4, 0x40000
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_cmp_eq_u64 s[46:47], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 19
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_and_b32 s10, s4, 0x80000
+; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_cmp_eq_u64 s[10:11], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s13
+; VI-NEXT: s_cselect_b32 s10, 0, s12
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 20
+; VI-NEXT: s_and_b32 s12, s4, 0x100000
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 21
+; VI-NEXT: s_and_b32 s12, s4, 0x200000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 22
+; VI-NEXT: s_and_b32 s12, s4, 0x400000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 23
+; VI-NEXT: s_and_b32 s12, s4, 0x800000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 24
+; VI-NEXT: s_and_b32 s12, s4, 0x1000000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 25
+; VI-NEXT: s_and_b32 s12, s4, 0x2000000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 26
+; VI-NEXT: s_and_b32 s12, s4, 0x4000000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 27
+; VI-NEXT: s_and_b32 s12, s4, 0x8000000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 28
+; VI-NEXT: s_and_b32 s12, s4, 0x10000000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 29
+; VI-NEXT: s_and_b32 s12, s4, 0x20000000
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 30
+; VI-NEXT: s_and_b32 s12, s4, 2.0
+; VI-NEXT: s_cmp_eq_u64 s[12:13], 0
+; VI-NEXT: s_cselect_b32 s11, 0, s11
+; VI-NEXT: s_cselect_b32 s10, 0, s10
+; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 31
+; VI-NEXT: s_cmp_gt_i32 s4, -1
+; VI-NEXT: s_cselect_b32 s5, 0, s7
+; VI-NEXT: s_cselect_b32 s4, 0, s6
+; VI-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, s5
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_clmulh_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s6, s2
+; GFX9-NEXT: s_mov_b32 s7, s3
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, s10
+; GFX9-NEXT: s_mov_b32 s5, s11
+; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: s_mov_b32 s0, s8
+; GFX9-NEXT: s_mov_b32 s5, 0
+; GFX9-NEXT: s_mov_b32 s11, s5
+; GFX9-NEXT: s_mov_b32 s7, s5
+; GFX9-NEXT: s_mov_b32 s13, s5
+; GFX9-NEXT: s_mov_b32 s15, s5
+; GFX9-NEXT: s_mov_b32 s17, s5
+; GFX9-NEXT: s_mov_b32 s19, s5
+; GFX9-NEXT: s_mov_b32 s21, s5
+; GFX9-NEXT: s_mov_b32 s23, s5
+; GFX9-NEXT: s_mov_b32 s25, s5
+; GFX9-NEXT: s_mov_b32 s27, s5
+; GFX9-NEXT: s_mov_b32 s29, s5
+; GFX9-NEXT: s_mov_b32 s31, s5
+; GFX9-NEXT: s_mov_b32 s35, s5
+; GFX9-NEXT: s_mov_b32 s37, s5
+; GFX9-NEXT: s_mov_b32 s39, s5
+; GFX9-NEXT: s_mov_b32 s41, s5
+; GFX9-NEXT: s_mov_b32 s43, s5
+; GFX9-NEXT: s_mov_b32 s45, s5
+; GFX9-NEXT: s_mov_b32 s47, s5
+; GFX9-NEXT: s_mov_b32 s1, s9
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_bfe_i32 s6, s8, 0x10000
+; GFX9-NEXT: s_lshl_b64 s[48:49], s[4:5], 1
+; GFX9-NEXT: s_and_b32 s10, s8, 2
+; GFX9-NEXT: s_and_b32 s6, s6, s4
+; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s49
+; GFX9-NEXT: s_cselect_b32 s10, 0, s48
+; GFX9-NEXT: s_lshl_b64 s[48:49], s[4:5], 2
+; GFX9-NEXT: s_and_b32 s12, s8, 4
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s49
+; GFX9-NEXT: s_cselect_b32 s10, 0, s48
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 3
+; GFX9-NEXT: s_and_b32 s14, s8, 8
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[14:15], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 4
+; GFX9-NEXT: s_and_b32 s16, s8, 16
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 5
+; GFX9-NEXT: s_and_b32 s18, s8, 32
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[18:19], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 6
+; GFX9-NEXT: s_and_b32 s20, s8, 64
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[20:21], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 7
+; GFX9-NEXT: s_and_b32 s22, s8, 0x80
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[22:23], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 8
+; GFX9-NEXT: s_and_b32 s24, s8, 0x100
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[24:25], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 9
+; GFX9-NEXT: s_and_b32 s26, s8, 0x200
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[26:27], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 10
+; GFX9-NEXT: s_and_b32 s28, s8, 0x400
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[28:29], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 11
+; GFX9-NEXT: s_and_b32 s30, s8, 0x800
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[30:31], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 12
+; GFX9-NEXT: s_and_b32 s34, s8, 0x1000
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[34:35], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 13
+; GFX9-NEXT: s_and_b32 s36, s8, 0x2000
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[36:37], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 14
+; GFX9-NEXT: s_and_b32 s38, s8, 0x4000
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[38:39], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 15
+; GFX9-NEXT: s_and_b32 s40, s8, 0x8000
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[40:41], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 16
+; GFX9-NEXT: s_and_b32 s42, s8, 0x10000
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[42:43], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 17
+; GFX9-NEXT: s_and_b32 s44, s8, 0x20000
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[44:45], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 18
+; GFX9-NEXT: s_and_b32 s46, s8, 0x40000
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_cmp_eq_u64 s[46:47], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 19
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_and_b32 s10, s8, 0x80000
+; GFX9-NEXT: s_mov_b32 s11, s5
+; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s13
+; GFX9-NEXT: s_cselect_b32 s10, 0, s12
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 20
+; GFX9-NEXT: s_and_b32 s12, s8, 0x100000
+; GFX9-NEXT: s_mov_b32 s13, s5
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 21
+; GFX9-NEXT: s_and_b32 s12, s8, 0x200000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 22
+; GFX9-NEXT: s_and_b32 s12, s8, 0x400000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 23
+; GFX9-NEXT: s_and_b32 s12, s8, 0x800000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 24
+; GFX9-NEXT: s_and_b32 s12, s8, 0x1000000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 25
+; GFX9-NEXT: s_and_b32 s12, s8, 0x2000000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 26
+; GFX9-NEXT: s_and_b32 s12, s8, 0x4000000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 27
+; GFX9-NEXT: s_and_b32 s12, s8, 0x8000000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 28
+; GFX9-NEXT: s_and_b32 s12, s8, 0x10000000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 29
+; GFX9-NEXT: s_and_b32 s12, s8, 0x20000000
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 30
+; GFX9-NEXT: s_and_b32 s12, s8, 2.0
+; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0
+; GFX9-NEXT: s_cselect_b32 s11, 0, s11
+; GFX9-NEXT: s_cselect_b32 s10, 0, s10
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 31
+; GFX9-NEXT: s_cmp_gt_i32 s8, -1
+; GFX9-NEXT: s_cselect_b32 s5, 0, s5
+; GFX9-NEXT: s_cselect_b32 s4, 0, s4
+; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: test_clmulh_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_mov_b32 s6, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s6
+; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s8, s2
+; GFX10-NEXT: s_mov_b32 s9, s3
+; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX10-NEXT: s_mov_b32 s11, s3
+; GFX10-NEXT: s_mov_b32 s9, s3
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10-NEXT: s_bfe_i32 s5, s4, 0x10000
+; GFX10-NEXT: s_and_b32 s10, s4, 2
+; GFX10-NEXT: s_lshl_b64 s[12:13], s[2:3], 1
+; GFX10-NEXT: s_and_b32 s8, s5, s2
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_cselect_b32 s13, 0, s13
+; GFX10-NEXT: s_cselect_b32 s12, 0, s12
+; GFX10-NEXT: s_and_b32 s10, s4, 4
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 2
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 8
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 3
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 16
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 4
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 32
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 5
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 64
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 6
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x80
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 7
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x100
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 8
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x200
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 9
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x400
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 10
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x800
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 11
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x1000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 12
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x2000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 13
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x4000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 14
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x8000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 15
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x10000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 16
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x20000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 17
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x40000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 18
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x80000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 19
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x100000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 20
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x200000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 21
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x400000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 22
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x800000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 23
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x1000000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 24
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x2000000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 25
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x4000000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 26
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x8000000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 27
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x10000000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 28
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 0x20000000
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 29
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s13, 0, s15
+; GFX10-NEXT: s_cselect_b32 s12, 0, s14
+; GFX10-NEXT: s_and_b32 s10, s4, 2.0
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 30
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: s_cselect_b32 s11, 0, s15
+; GFX10-NEXT: s_cselect_b32 s10, 0, s14
+; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
+; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; GFX10-NEXT: s_cmp_gt_i32 s4, -1
+; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_cselect_b32 s3, 0, s3
+; GFX10-NEXT: s_cselect_b32 s2, 0, s2
+; GFX10-NEXT: s_xor_b64 s[2:3], s[8:9], s[2:3]
+; GFX10-NEXT: v_mov_b32_e32 v0, s3
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_clmulh_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s6
+; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s2
+; GFX11-NEXT: s_mov_b32 s9, s3
+; GFX11-NEXT: s_mov_b32 s3, 0
+; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s9, s3
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-NEXT: s_bfe_i32 s5, s4, 0x10000
+; GFX11-NEXT: s_and_b32 s10, s4, 2
+; GFX11-NEXT: s_lshl_b64 s[12:13], s[2:3], 1
+; GFX11-NEXT: s_and_b32 s8, s5, s2
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_cselect_b32 s13, 0, s13
+; GFX11-NEXT: s_cselect_b32 s12, 0, s12
+; GFX11-NEXT: s_and_b32 s10, s4, 4
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 2
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 8
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 3
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 16
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 4
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 32
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 5
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 64
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 6
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x80
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 7
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x100
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 8
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x200
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 9
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x400
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 10
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x800
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 11
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x1000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 12
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x2000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 13
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x4000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 14
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x8000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 15
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x10000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 16
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x20000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 17
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x40000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 18
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x80000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 19
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x100000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 20
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x200000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 21
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x400000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 22
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x800000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 23
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x1000000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 24
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x2000000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 25
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x4000000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 26
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x8000000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 27
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x10000000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 28
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 0x20000000
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 29
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s13, 0, s15
+; GFX11-NEXT: s_cselect_b32 s12, 0, s14
+; GFX11-NEXT: s_and_b32 s10, s4, 2.0
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 30
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT: s_cselect_b32 s11, 0, s15
+; GFX11-NEXT: s_cselect_b32 s10, 0, s14
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
+; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11]
+; GFX11-NEXT: s_cmp_gt_i32 s4, -1
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_cselect_b32 s3, 0, s3
+; GFX11-NEXT: s_cselect_b32 s2, 0, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b64 s[2:3], s[8:9], s[2:3]
+; GFX11-NEXT: v_mov_b32_e32 v0, s3
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_clmulh_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s2
+; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s3, 0
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT: s_mov_b32 s5, s3
+; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s11, s3
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s15, s3
+; GFX12-NEXT: s_mov_b32 s17, s3
+; GFX12-NEXT: s_mov_b32 s19, s3
+; GFX12-NEXT: s_mov_b32 s21, s3
+; GFX12-NEXT: s_mov_b32 s23, s3
+; GFX12-NEXT: s_mov_b32 s25, s3
+; GFX12-NEXT: s_mov_b32 s27, s3
+; GFX12-NEXT: s_mov_b32 s29, s3
+; GFX12-NEXT: s_mov_b32 s31, s3
+; GFX12-NEXT: s_mov_b32 s35, s3
+; GFX12-NEXT: s_mov_b32 s37, s3
+; GFX12-NEXT: s_mov_b32 s39, s3
+; GFX12-NEXT: s_mov_b32 s41, s3
+; GFX12-NEXT: s_mov_b32 s43, s3
+; GFX12-NEXT: s_mov_b32 s45, s3
+; GFX12-NEXT: s_mov_b32 s47, s3
+; GFX12-NEXT: s_mov_b32 s49, s3
+; GFX12-NEXT: s_mov_b32 s51, s3
+; GFX12-NEXT: s_mov_b32 s53, s3
+; GFX12-NEXT: s_mov_b32 s55, s3
+; GFX12-NEXT: s_mov_b32 s57, s3
+; GFX12-NEXT: s_mov_b32 s59, s3
+; GFX12-NEXT: s_mov_b32 s61, s3
+; GFX12-NEXT: s_mov_b32 s63, s3
+; GFX12-NEXT: s_mov_b32 s65, s3
+; GFX12-NEXT: s_mov_b32 s67, s3
+; GFX12-NEXT: s_mov_b32 s69, s3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s33, v1
+; GFX12-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-NEXT: s_and_b32 s4, s33, 2
+; GFX12-NEXT: s_and_b32 s8, s33, 1
+; GFX12-NEXT: s_and_b32 s10, s33, 4
+; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], s[4:5]
+; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[8:9]
+; GFX12-NEXT: s_and_b32 s12, s33, 8
+; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[10:11]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
+; GFX12-NEXT: s_and_b32 s14, s33, 16
+; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[12:13]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_and_b32 s16, s33, 32
+; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[14:15]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: s_and_b32 s18, s33, 64
+; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[16:17]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
+; GFX12-NEXT: s_and_b32 s20, s33, 0x80
+; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[18:19]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX12-NEXT: s_and_b32 s22, s33, 0x100
+; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[20:21]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_and_b32 s24, s33, 0x200
+; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[22:23]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: s_and_b32 s26, s33, 0x400
+; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[24:25]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
+; GFX12-NEXT: s_and_b32 s28, s33, 0x800
+; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[26:27]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX12-NEXT: s_and_b32 s30, s33, 0x1000
+; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[28:29]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_and_b32 s34, s33, 0x2000
+; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[30:31]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: s_and_b32 s36, s33, 0x4000
+; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[34:35]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
+; GFX12-NEXT: s_and_b32 s38, s33, 0x8000
+; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[36:37]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX12-NEXT: s_and_b32 s40, s33, 0x10000
+; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[38:39]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_and_b32 s42, s33, 0x20000
+; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[40:41]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: s_and_b32 s44, s33, 0x40000
+; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[42:43]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
+; GFX12-NEXT: s_and_b32 s46, s33, 0x80000
+; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[44:45]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX12-NEXT: s_and_b32 s48, s33, 0x100000
+; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[46:47]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_and_b32 s50, s33, 0x200000
+; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[48:49]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: s_and_b32 s52, s33, 0x400000
+; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[50:51]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
+; GFX12-NEXT: s_and_b32 s54, s33, 0x800000
+; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[52:53]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX12-NEXT: s_and_b32 s56, s33, 0x1000000
+; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[54:55]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_and_b32 s58, s33, 0x2000000
+; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[56:57]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: s_and_b32 s60, s33, 0x4000000
+; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[58:59]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
+; GFX12-NEXT: s_and_b32 s62, s33, 0x8000000
+; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[60:61]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX12-NEXT: s_and_b32 s64, s33, 0x10000000
+; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[62:63]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_and_b32 s66, s33, 0x20000000
+; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[64:65]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: s_and_b32 s68, s33, 2.0
+; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[66:67]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
+; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[68:69]
+; GFX12-NEXT: s_and_b32 s12, s33, 0x80000000
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
+; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[12:13]
+; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: test_clmulh_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s9, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null
+; GFX1250-NEXT: s_mov_b32 s5, s3
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b64 s[8:9], 0x80000000
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1250-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1250-NEXT: s_and_b64 s[10:11], s[4:5], 2
+; GFX1250-NEXT: s_and_b64 s[12:13], s[4:5], 1
+; GFX1250-NEXT: s_and_b64 s[14:15], s[4:5], 4
+; GFX1250-NEXT: s_mul_u64 s[10:11], s[2:3], s[10:11]
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[16:17], s[4:5], 8
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[14:15]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[12:13], s[10:11]
+; GFX1250-NEXT: s_and_b64 s[18:19], s[4:5], 16
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[16:17]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[20:21], s[4:5], 32
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[18:19]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[22:23], s[4:5], 64
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[20:21]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[24:25], s[4:5], 0x80
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[22:23]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[26:27], s[4:5], 0x100
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[24:25]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[28:29], s[4:5], 0x200
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[26:27]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[30:31], s[4:5], 0x400
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[28:29]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[34:35], s[4:5], 0x800
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[30:31]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[36:37], s[4:5], 0x1000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[34:35]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[38:39], s[4:5], 0x2000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[36:37]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[40:41], s[4:5], 0x4000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[38:39]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[42:43], s[4:5], 0x8000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[40:41]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[44:45], s[4:5], 0x10000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[42:43]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[46:47], s[4:5], 0x20000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[44:45]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[48:49], s[4:5], 0x40000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[46:47]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[50:51], s[4:5], 0x80000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[48:49]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[52:53], s[4:5], 0x100000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[50:51]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[54:55], s[4:5], 0x200000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[52:53]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[56:57], s[4:5], 0x400000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[54:55]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[58:59], s[4:5], 0x800000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[56:57]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[60:61], s[4:5], 0x1000000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[58:59]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[62:63], s[4:5], 0x2000000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[60:61]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[64:65], s[4:5], 0x4000000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[62:63]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[66:67], s[4:5], 0x8000000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[64:65]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[68:69], s[4:5], 0x10000000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[66:67]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_and_b64 s[70:71], s[4:5], 0x20000000
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[68:69]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[72:73], s[4:5], 0x40000000
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[70:71]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15]
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[72:73]
+; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX1250-NEXT: s_xor_b64 s[8:9], s[10:11], s[14:15]
+; GFX1250-NEXT: s_mul_u64 s[2:3], s[2:3], s[4:5]
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: s_xor_b64 s[2:3], s[8:9], s[2:3]
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX1250-NEXT: s_endpgm
+;
+; EG-LABEL: test_clmulh_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @8
+; EG-NEXT: ALU 98, @11, KC0[], KC1[]
+; EG-NEXT: ALU 110, @110, KC0[], KC1[]
+; EG-NEXT: ALU 13, @221, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 8:
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: LSHR * T0.W, T0.X, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, PV.W, literal.x,
+; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT: LSHR * T1.W, T0.Y, literal.y,
+; EG-NEXT: 65280(9.147676e-41), 8(1.121039e-44)
+; EG-NEXT: AND_INT T1.X, PS, literal.x,
+; EG-NEXT: LSHR T1.Y, T0.Y, literal.y,
+; EG-NEXT: LSHL T1.Z, PV.W, literal.z,
+; EG-NEXT: LSHL T0.W, T0.Y, literal.y,
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: 65280(9.147676e-41), 24(3.363116e-44)
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.X, T0.X, literal.x,
+; EG-NEXT: LSHL T0.Y, PS, literal.y,
+; EG-NEXT: LSHL T2.Z, T0.X, literal.x,
+; EG-NEXT: OR_INT T0.W, PV.W, PV.Z,
+; EG-NEXT: OR_INT * T1.W, PV.X, PV.Y,
+; EG-NEXT: 24(3.363116e-44), 8(1.121039e-44)
+; EG-NEXT: OR_INT T1.Z, PV.W, PS,
+; EG-NEXT: OR_INT T0.W, PV.Z, PV.Y,
+; EG-NEXT: OR_INT * T1.W, T0.Z, PV.X,
+; EG-NEXT: OR_INT T0.Z, PV.W, PS,
+; EG-NEXT: AND_INT T0.W, PV.Z, literal.x,
+; EG-NEXT: LSHR * T1.W, PV.Z, literal.y,
+; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45)
+; EG-NEXT: AND_INT T0.Y, PS, literal.x,
+; EG-NEXT: LSHL T1.Z, PV.W, literal.y,
+; EG-NEXT: AND_INT T0.W, PV.Z, literal.x,
+; EG-NEXT: LSHR * T1.W, PV.Z, literal.y,
+; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45)
+; EG-NEXT: AND_INT T0.Z, PS, literal.x,
+; EG-NEXT: LSHL T0.W, PV.W, literal.y,
+; EG-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45)
+; EG-NEXT: AND_INT T1.Z, PS, literal.x,
+; EG-NEXT: LSHR T1.W, PS, literal.y,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45)
+; EG-NEXT: AND_INT T0.Y, PS, literal.x,
+; EG-NEXT: LSHR T0.Z, PS, literal.y,
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.y,
+; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45)
+; EG-NEXT: OR_INT T1.Z, PV.W, PS,
+; EG-NEXT: AND_INT T0.W, PV.Z, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.Y, literal.y,
+; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45)
+; EG-NEXT: OR_INT T0.Z, PV.W, PS,
+; EG-NEXT: AND_INT T0.W, PV.Z, literal.x,
+; EG-NEXT: LSHR * T1.W, PV.Z, 1,
+; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Y, PS, literal.x,
+; EG-NEXT: LSHL T1.Z, PV.W, 1,
+; EG-NEXT: AND_INT T0.W, PV.Z, literal.x,
+; EG-NEXT: LSHR * T1.W, PV.Z, 1,
+; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, PS, literal.x,
+; EG-NEXT: LSHL T0.W, PV.W, 1,
+; EG-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.W, PS, 1,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: AND_INT T3.W, T1.W, literal.x,
+; EG-NEXT: MULLO_INT * T0.X, PS, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.W, T1.W, literal.x,
+; EG-NEXT: MULLO_INT * T0.Y, T0.W, PV.W,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, T0.X, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T2.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T2.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T2.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T2.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T2.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 256(3.587324e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T2.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 512(7.174648e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT * T1.Z, T1.W, literal.x,
+; EG-NEXT: 1024(1.434930e-42), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 110:
+; EG-NEXT: XOR_INT T3.W, T2.W, T0.X,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, T0.Z,
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, T1.Z,
+; EG-NEXT: 2048(2.869859e-42), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 4096(5.739719e-42), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 8192(1.147944e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 16384(2.295887e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 32768(4.591775e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 65536(9.183550e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 131072(1.836710e-40), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 262144(3.673420e-40), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 524288(7.346840e-40), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 1048576(1.469368e-39), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 2097152(2.938736e-39), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 4194304(5.877472e-39), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 8388608(1.175494e-38), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 16777216(2.350989e-38), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T3.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 33554432(9.403955e-38), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T4.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 67108864(1.504633e-36), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T4.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 134217728(3.851860e-34), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T4.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 268435456(2.524355e-29), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T4.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 536870912(1.084202e-19), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T4.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: 1073741824(2.000000e+00), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T1.W, literal.x,
+; EG-NEXT: XOR_INT T1.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: -2147483648(-0.000000e+00), 0(0.000000e+00)
+; EG-NEXT: XOR_INT T1.W, PV.W, PS,
+; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z,
+; EG-NEXT: LSHR T0.Z, T3.W, literal.x,
+; EG-NEXT: XOR_INT T0.W, PV.W, PS,
+; EG-NEXT: AND_INT * T1.W, T3.W, literal.y,
+; EG-NEXT: 8(1.121039e-44), 65280(9.147676e-41)
+; EG-NEXT: LSHL T0.Y, PS, literal.x,
+; EG-NEXT: LSHL T1.Z, T2.W, literal.y,
+; EG-NEXT: LSHR T0.W, PV.W, literal.y,
+; EG-NEXT: AND_INT * T1.W, PV.Z, literal.z,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: 65280(9.147676e-41), 0(0.000000e+00)
+; EG-NEXT: OR_INT T0.W, PS, PV.W,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.Y,
+; EG-NEXT: OR_INT * T0.W, PS, PV.W,
+; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
+; EG-NEXT: LSHR * T0.W, PV.W, literal.y,
+; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45)
+; EG-NEXT: AND_INT T0.W, PS, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.W, literal.y,
+; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
+; EG-NEXT: LSHR * T0.W, PV.W, literal.y,
+; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45)
+; EG-NEXT: ALU clause starting at 221:
+; EG-NEXT: AND_INT T0.W, T0.W, literal.x,
+; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
+; EG-NEXT: LSHR * T0.W, PV.W, 1,
+; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PS, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.W, 1,
+; EG-NEXT: 1431655764(1.466015e+13), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: LSHR T0.X, PV.W, 1,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+ %a = load i32, ptr addrspace(1) %in
+ %b = load i32, ptr addrspace(1) %b_ptr
+ %a.ext = zext i32 %a to i64
+ %b.ext = zext i32 %b to i64
+ %clmul = call i64 @llvm.clmul.i64(i64 %a.ext, i64 %b.ext)
+ %res.ext = lshr i64 %clmul, 32
+ %res = trunc i64 %res.ext to i32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1 ; intrinsic declaration: no parameters, returns i32, carries attribute group #1. NOTE(review): no call to it is visible in this part of the file - confirm it is still needed.
+
+attributes #0 = { nounwind } ; attribute group #0: nounwind only
+attributes #1 = { nounwind readnone} ; attribute group #1 (referenced by the declaration above): nounwind plus readnone
More information about the llvm-commits
mailing list