[llvm] 79cda46 - AMDGPU/GlobalISel: Add baseline test for mul

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Sun Mar 15 11:54:01 PDT 2020


Author: Matt Arsenault
Date: 2020-03-15T14:53:51-04:00
New Revision: 79cda46e49bbf4e7cf7fd90f71cffcd407298983

URL: https://github.com/llvm/llvm-project/commit/79cda46e49bbf4e7cf7fd90f71cffcd407298983
DIFF: https://github.com/llvm/llvm-project/commit/79cda46e49bbf4e7cf7fd90f71cffcd407298983.diff

LOG: AMDGPU/GlobalISel: Add baseline test for mul

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 4d37f5b67e02..070bfaf8ff03 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -1,96 +1,2339 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
 
 define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
+; GFX7-LABEL: s_mul_i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mul_i32 s0, s0, s1
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_mul_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_mov_b32 s2, 0xffff
+; GFX8-NEXT:    s_and_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s1, s1, s2
+; GFX8-NEXT:    s_mul_i32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_mul_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NEXT:    s_and_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s1, s1, s2
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
   %result = mul i16 %num, %den
   ret i16 %result
 }
 
 define i16 @v_mul_i16(i16 %num, i16 %den) {
+; GFX7-LABEL: v_mul_i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_mul_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_mul_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i16 %num, %den
   ret i16 %result
 }
 
 define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inreg zeroext %den) {
+; GFX7-LABEL: s_mul_i16_zeroext:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mul_i32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_mul_i16_zeroext:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_mov_b32 s2, 0xffff
+; GFX8-NEXT:    s_and_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s1, s1, s2
+; GFX8-NEXT:    s_mul_i32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s0, s0, s2
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_mul_i16_zeroext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NEXT:    s_and_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s1, s1, s2
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s0, s0, s2
+; GFX9-NEXT:    ; return to shader part epilog
   %result = mul i16 %num, %den
   ret i16 %result
 }
 
 define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
+; GFX7-LABEL: v_mul_i16_zeroext:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_mul_i16_zeroext:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_mul_i16_zeroext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i16 %num, %den
   ret i16 %result
 }
 
 define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) {
+; GFX7-LABEL: s_mul_i16_signext:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mul_i32 s0, s0, s1
+; GFX7-NEXT:    s_sext_i32_i16 s0, s0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_mul_i16_signext:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_mov_b32 s2, 0xffff
+; GFX8-NEXT:    s_and_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s1, s1, s2
+; GFX8-NEXT:    s_mul_i32 s0, s0, s1
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_mul_i16_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NEXT:    s_and_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s1, s1, s2
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    s_sext_i32_i16 s0, s0
+; GFX9-NEXT:    ; return to shader part epilog
   %result = mul i16 %num, %den
   ret i16 %result
 }
 
 define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
+; GFX7-LABEL: v_mul_i16_signext:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_mul_i16_signext:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_mul_i16_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i16 %num, %den
   ret i16 %result
 }
 
 define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
+; GCN-LABEL: s_mul_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mul_i32 s0, s0, s1
+; GCN-NEXT:    ; return to shader part epilog
   %result = mul i32 %num, %den
   ret i32 %result
 }
 
 define i32 @v_mul_i32(i32 %num, i32 %den) {
+; GCN-LABEL: v_mul_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i32 %num, %den
   ret i32 %result
 }
 
 define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
+; GCN-LABEL: s_mul_v2i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mul_i32 s0, s0, s2
+; GCN-NEXT:    s_mul_i32 s1, s1, s3
+; GCN-NEXT:    ; return to shader part epilog
   %result = mul <2 x i32> %num, %den
   ret <2 x i32> %result
 }
 
 define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
+; GCN-LABEL: v_mul_v2i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GCN-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = mul <2 x i32> %num, %den
   ret <2 x i32> %result
 }
 
 define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
+; GFX7-LABEL: s_mul_i64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX7-NEXT:    s_mul_i32 s4, s0, s2
+; GFX7-NEXT:    s_mul_i32 s1, s1, s2
+; GFX7-NEXT:    s_mul_i32 s0, s0, s3
+; GFX7-NEXT:    s_add_i32 s1, s1, s0
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX7-NEXT:    s_mov_b32 s0, s4
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_mul_i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX8-NEXT:    s_mul_i32 s4, s0, s2
+; GFX8-NEXT:    s_mul_i32 s1, s1, s2
+; GFX8-NEXT:    s_mul_i32 s0, s0, s3
+; GFX8-NEXT:    s_add_i32 s1, s1, s0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s1, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX8-NEXT:    s_mov_b32 s0, s4
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_mul_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mul_i32 s1, s1, s2
+; GFX9-NEXT:    s_mul_i32 s3, s0, s3
+; GFX9-NEXT:    s_mul_i32 s4, s0, s2
+; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s2
+; GFX9-NEXT:    s_add_i32 s1, s1, s3
+; GFX9-NEXT:    s_add_i32 s1, s1, s0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    ; return to shader part epilog
   %result = mul i64 %num, %den
   ret i64 %result
 }
 
 define i64 @v_mul_i64(i64 %num, i64 %den) {
+; GFX7-LABEL: v_mul_i64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_lo_u32 v4, v0, v3
+; GFX7-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX7-NEXT:    v_mul_lo_u32 v3, v0, v2
+; GFX7-NEXT:    v_mul_hi_u32 v0, v0, v2
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_mul_i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_lo_u32 v4, v0, v3
+; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX8-NEXT:    v_mul_lo_u32 v3, v0, v2
+; GFX8-NEXT:    v_mul_hi_u32 v0, v0, v2
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v4
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_mul_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v3
+; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v2
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GFX9-NEXT:    v_add3_u32 v1, v1, v3, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i64 %num, %den
   ret i64 %result
 }
 
 define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
+; GFX7-LABEL: s_mul_i96:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_mov_b32_e32 v0, s3
+; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    s_mul_i32 s7, s1, s3
+; GFX7-NEXT:    s_mul_i32 s8, s0, s4
+; GFX7-NEXT:    s_add_u32 s7, s7, s8
+; GFX7-NEXT:    v_mov_b32_e32 v3, s4
+; GFX7-NEXT:    v_mul_hi_u32 v2, v2, s3
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s7, v0
+; GFX7-NEXT:    s_mul_i32 s7, s1, s4
+; GFX7-NEXT:    s_mul_i32 s2, s2, s3
+; GFX7-NEXT:    v_mul_hi_u32 v3, s0, v3
+; GFX7-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX7-NEXT:    s_mul_i32 s6, s0, s3
+; GFX7-NEXT:    s_mul_i32 s5, s0, s5
+; GFX7-NEXT:    s_add_i32 s0, s2, s7
+; GFX7-NEXT:    s_lshl_b32 s8, s8, 31
+; GFX7-NEXT:    s_add_i32 s0, s0, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
+; GFX7-NEXT:    s_lshr_b32 s8, s8, 31
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, s8, v1
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX7-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX7-NEXT:    s_mov_b32 s0, s6
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_mul_i96:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    s_mul_i32 s7, s1, s3
+; GFX8-NEXT:    s_mul_i32 s8, s0, s4
+; GFX8-NEXT:    s_add_u32 s7, s7, s8
+; GFX8-NEXT:    v_mov_b32_e32 v3, s4
+; GFX8-NEXT:    v_mul_hi_u32 v2, v2, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s7, v0
+; GFX8-NEXT:    s_mul_i32 s7, s1, s4
+; GFX8-NEXT:    s_mul_i32 s2, s2, s3
+; GFX8-NEXT:    v_mul_hi_u32 v3, s0, v3
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_mul_i32 s6, s0, s3
+; GFX8-NEXT:    s_mul_i32 s5, s0, s5
+; GFX8-NEXT:    s_add_i32 s0, s2, s7
+; GFX8-NEXT:    s_lshl_b32 s8, s8, 31
+; GFX8-NEXT:    s_add_i32 s0, s0, s5
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT:    s_lshr_b32 s8, s8, 31
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s8, v1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX8-NEXT:    s_mov_b32 s0, s6
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_mul_i96:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mul_i32 s7, s1, s3
+; GFX9-NEXT:    s_mul_i32 s8, s0, s4
+; GFX9-NEXT:    s_add_u32 s7, s7, s8
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s8, s8, 31
+; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s3
+; GFX9-NEXT:    s_lshr_b32 s8, s8, 31
+; GFX9-NEXT:    s_add_u32 s7, s7, s9
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 31
+; GFX9-NEXT:    s_lshr_b32 s9, s9, 31
+; GFX9-NEXT:    s_add_i32 s8, s8, s9
+; GFX9-NEXT:    s_mul_i32 s9, s1, s4
+; GFX9-NEXT:    s_mul_i32 s2, s2, s3
+; GFX9-NEXT:    s_mul_i32 s5, s0, s5
+; GFX9-NEXT:    s_add_i32 s2, s2, s9
+; GFX9-NEXT:    s_mul_hi_u32 s1, s1, s3
+; GFX9-NEXT:    s_add_i32 s2, s2, s5
+; GFX9-NEXT:    s_mul_i32 s6, s0, s3
+; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s4
+; GFX9-NEXT:    s_add_i32 s1, s2, s1
+; GFX9-NEXT:    s_add_i32 s0, s1, s0
+; GFX9-NEXT:    s_add_i32 s2, s0, s8
+; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s1, s7
+; GFX9-NEXT:    ; return to shader part epilog
   %result = mul i96 %num, %den
   %cast = bitcast i96 %result to <3 x i32>
   ret <3 x i32> %cast
 }
 
 define i96 @v_mul_i96(i96 %num, i96 %den) {
+; GFX7-LABEL: v_mul_i96:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_lo_u32 v7, v1, v3
+; GFX7-NEXT:    v_mul_lo_u32 v8, v0, v4
+; GFX7-NEXT:    v_mul_hi_u32 v9, v0, v3
+; GFX7-NEXT:    v_mul_lo_u32 v2, v2, v3
+; GFX7-NEXT:    v_mul_lo_u32 v5, v0, v5
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; GFX7-NEXT:    v_mul_lo_u32 v9, v1, v4
+; GFX7-NEXT:    v_mul_hi_u32 v1, v1, v3
+; GFX7-NEXT:    v_mul_lo_u32 v6, v0, v3
+; GFX7-NEXT:    v_mul_hi_u32 v0, v0, v4
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v0, v8
+; GFX7-NEXT:    v_mov_b32_e32 v0, v6
+; GFX7-NEXT:    v_mov_b32_e32 v1, v7
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_mul_i96:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_lo_u32 v7, v1, v3
+; GFX8-NEXT:    v_mul_lo_u32 v8, v0, v4
+; GFX8-NEXT:    v_mul_hi_u32 v9, v0, v3
+; GFX8-NEXT:    v_mul_lo_u32 v2, v2, v3
+; GFX8-NEXT:    v_mul_lo_u32 v5, v0, v5
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
+; GFX8-NEXT:    v_mul_lo_u32 v9, v1, v4
+; GFX8-NEXT:    v_mul_hi_u32 v1, v1, v3
+; GFX8-NEXT:    v_mul_lo_u32 v6, v0, v3
+; GFX8-NEXT:    v_mul_hi_u32 v0, v0, v4
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v9
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v8
+; GFX8-NEXT:    v_mov_b32_e32 v0, v6
+; GFX8-NEXT:    v_mov_b32_e32 v1, v7
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_mul_i96:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v3
+; GFX9-NEXT:    v_mul_lo_u32 v8, v0, v4
+; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v3
+; GFX9-NEXT:    v_mul_lo_u32 v10, v1, v4
+; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v3
+; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v5
+; GFX9-NEXT:    v_mul_hi_u32 v1, v1, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v3
+; GFX9-NEXT:    v_mul_hi_u32 v0, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v10
+; GFX9-NEXT:    v_add_u32_e32 v3, v8, v9
+; GFX9-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-NEXT:    v_add3_u32 v2, v1, v0, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, v6
+; GFX9-NEXT:    v_mov_b32_e32 v1, v7
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i96 %num, %den
   ret i96 %result
 }
 
 define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
+; GFX7-LABEL: s_mul_i128:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX7-NEXT:    s_mul_i32 s9, s1, s4
+; GFX7-NEXT:    s_mul_i32 s10, s0, s5
+; GFX7-NEXT:    s_add_u32 s9, s9, s10
+; GFX7-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX7-NEXT:    s_lshl_b32 s10, s10, 31
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s9, v0
+; GFX7-NEXT:    s_lshr_b32 s10, s10, 31
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, s10, v1
+; GFX7-NEXT:    s_mul_i32 s9, s2, s4
+; GFX7-NEXT:    s_mul_i32 s10, s1, s5
+; GFX7-NEXT:    s_add_u32 s9, s9, s10
+; GFX7-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    s_lshl_b32 s10, s10, 31
+; GFX7-NEXT:    s_mul_i32 s11, s0, s6
+; GFX7-NEXT:    s_lshr_b32 s10, s10, 31
+; GFX7-NEXT:    v_mul_hi_u32 v2, v2, s4
+; GFX7-NEXT:    s_add_u32 s9, s9, s11
+; GFX7-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX7-NEXT:    v_mov_b32_e32 v3, s5
+; GFX7-NEXT:    s_lshl_b32 s11, s11, 31
+; GFX7-NEXT:    v_mul_hi_u32 v4, s0, v3
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s9, v2
+; GFX7-NEXT:    s_lshr_b32 s11, s11, 31
+; GFX7-NEXT:    s_add_i32 s10, s10, s11
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, s10, v5
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v4, s2
+; GFX7-NEXT:    v_mov_b32_e32 v5, s6
+; GFX7-NEXT:    s_mul_i32 s5, s2, s5
+; GFX7-NEXT:    s_mul_i32 s3, s3, s4
+; GFX7-NEXT:    v_mul_hi_u32 v4, v4, s4
+; GFX7-NEXT:    s_mul_i32 s8, s0, s4
+; GFX7-NEXT:    s_mul_i32 s9, s1, s6
+; GFX7-NEXT:    v_mul_hi_u32 v3, s1, v3
+; GFX7-NEXT:    s_mul_i32 s7, s0, s7
+; GFX7-NEXT:    v_mul_hi_u32 v5, s0, v5
+; GFX7-NEXT:    s_add_i32 s0, s3, s5
+; GFX7-NEXT:    s_add_i32 s0, s0, s9
+; GFX7-NEXT:    s_add_i32 s0, s0, s7
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX7-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX7-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX7-NEXT:    s_mov_b32 s0, s8
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_mul_i128:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX8-NEXT:    s_mul_i32 s9, s1, s4
+; GFX8-NEXT:    s_mul_i32 s10, s0, s5
+; GFX8-NEXT:    s_add_u32 s9, s9, s10
+; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX8-NEXT:    s_lshl_b32 s10, s10, 31
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s9, v0
+; GFX8-NEXT:    s_lshr_b32 s10, s10, 31
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s10, v1
+; GFX8-NEXT:    s_mul_i32 s9, s2, s4
+; GFX8-NEXT:    s_mul_i32 s10, s1, s5
+; GFX8-NEXT:    s_add_u32 s9, s9, s10
+; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    s_lshl_b32 s10, s10, 31
+; GFX8-NEXT:    s_mul_i32 s11, s0, s6
+; GFX8-NEXT:    s_lshr_b32 s10, s10, 31
+; GFX8-NEXT:    v_mul_hi_u32 v2, v2, s4
+; GFX8-NEXT:    s_add_u32 s9, s9, s11
+; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    s_lshl_b32 s11, s11, 31
+; GFX8-NEXT:    v_mul_hi_u32 v4, s0, v3
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s9, v2
+; GFX8-NEXT:    s_lshr_b32 s11, s11, 31
+; GFX8-NEXT:    s_add_i32 s10, s10, s11
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s10, v5
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NEXT:    v_mov_b32_e32 v5, s6
+; GFX8-NEXT:    s_mul_i32 s5, s2, s5
+; GFX8-NEXT:    s_mul_i32 s3, s3, s4
+; GFX8-NEXT:    v_mul_hi_u32 v4, v4, s4
+; GFX8-NEXT:    s_mul_i32 s8, s0, s4
+; GFX8-NEXT:    s_mul_i32 s9, s1, s6
+; GFX8-NEXT:    v_mul_hi_u32 v3, s1, v3
+; GFX8-NEXT:    s_mul_i32 s7, s0, s7
+; GFX8-NEXT:    v_mul_hi_u32 v5, s0, v5
+; GFX8-NEXT:    s_add_i32 s0, s3, s5
+; GFX8-NEXT:    s_add_i32 s0, s0, s9
+; GFX8-NEXT:    s_add_i32 s0, s0, s7
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX8-NEXT:    s_mov_b32 s0, s8
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_mul_i128:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mul_i32 s9, s1, s4
+; GFX9-NEXT:    s_mul_i32 s10, s0, s5
+; GFX9-NEXT:    s_add_u32 s9, s9, s10
+; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s10, s10, 31
+; GFX9-NEXT:    s_mul_hi_u32 s11, s0, s4
+; GFX9-NEXT:    s_lshr_b32 s10, s10, 31
+; GFX9-NEXT:    s_add_u32 s9, s9, s11
+; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s11, s11, 31
+; GFX9-NEXT:    s_lshr_b32 s11, s11, 31
+; GFX9-NEXT:    s_add_i32 s10, s10, s11
+; GFX9-NEXT:    s_mul_i32 s11, s2, s4
+; GFX9-NEXT:    s_mul_i32 s12, s1, s5
+; GFX9-NEXT:    s_add_u32 s11, s11, s12
+; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s12, s12, 31
+; GFX9-NEXT:    s_mul_i32 s13, s0, s6
+; GFX9-NEXT:    s_lshr_b32 s12, s12, 31
+; GFX9-NEXT:    s_add_u32 s11, s11, s13
+; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s13, s13, 31
+; GFX9-NEXT:    s_lshr_b32 s13, s13, 31
+; GFX9-NEXT:    s_mul_hi_u32 s14, s1, s4
+; GFX9-NEXT:    s_add_i32 s12, s12, s13
+; GFX9-NEXT:    s_add_u32 s11, s11, s14
+; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s13, s13, 31
+; GFX9-NEXT:    s_lshr_b32 s13, s13, 31
+; GFX9-NEXT:    s_mul_hi_u32 s15, s0, s5
+; GFX9-NEXT:    s_add_i32 s12, s12, s13
+; GFX9-NEXT:    s_add_u32 s11, s11, s15
+; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s13, s13, 31
+; GFX9-NEXT:    s_lshr_b32 s13, s13, 31
+; GFX9-NEXT:    s_add_i32 s12, s12, s13
+; GFX9-NEXT:    s_add_u32 s10, s11, s10
+; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s11, s11, 31
+; GFX9-NEXT:    s_lshr_b32 s11, s11, 31
+; GFX9-NEXT:    s_add_i32 s12, s12, s11
+; GFX9-NEXT:    s_mul_i32 s11, s2, s5
+; GFX9-NEXT:    s_mul_i32 s3, s3, s4
+; GFX9-NEXT:    s_mul_i32 s13, s1, s6
+; GFX9-NEXT:    s_add_i32 s3, s3, s11
+; GFX9-NEXT:    s_mul_i32 s7, s0, s7
+; GFX9-NEXT:    s_add_i32 s3, s3, s13
+; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s4
+; GFX9-NEXT:    s_add_i32 s3, s3, s7
+; GFX9-NEXT:    s_mul_hi_u32 s1, s1, s5
+; GFX9-NEXT:    s_add_i32 s2, s3, s2
+; GFX9-NEXT:    s_mul_i32 s8, s0, s4
+; GFX9-NEXT:    s_add_i32 s1, s2, s1
+; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s6
+; GFX9-NEXT:    s_add_i32 s0, s1, s0
+; GFX9-NEXT:    s_add_i32 s3, s0, s12
+; GFX9-NEXT:    s_mov_b32 s0, s8
+; GFX9-NEXT:    s_mov_b32 s1, s9
+; GFX9-NEXT:    s_mov_b32 s2, s10
+; GFX9-NEXT:    ; return to shader part epilog
   %result = mul i128 %num, %den
   %cast = bitcast i128 %result to <4 x i32>
   ret <4 x i32> %cast
 }
 
 define i128 @v_mul_i128(i128 %num, i128 %den) {
+; GFX7-LABEL: v_mul_i128:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_lo_u32 v9, v1, v4
+; GFX7-NEXT:    v_mul_lo_u32 v10, v0, v5
+; GFX7-NEXT:    v_mul_hi_u32 v11, v0, v4
+; GFX7-NEXT:    v_mul_lo_u32 v12, v1, v5
+; GFX7-NEXT:    v_mul_lo_u32 v13, v0, v6
+; GFX7-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GFX7-NEXT:    v_mul_lo_u32 v11, v2, v4
+; GFX7-NEXT:    v_mul_hi_u32 v14, v1, v4
+; GFX7-NEXT:    v_mul_hi_u32 v15, v0, v5
+; GFX7-NEXT:    v_mul_lo_u32 v3, v3, v4
+; GFX7-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GFX7-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GFX7-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GFX7-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GFX7-NEXT:    v_mul_lo_u32 v12, v2, v5
+; GFX7-NEXT:    v_mul_lo_u32 v13, v1, v6
+; GFX7-NEXT:    v_mul_lo_u32 v7, v0, v7
+; GFX7-NEXT:    v_mul_hi_u32 v2, v2, v4
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v12
+; GFX7-NEXT:    v_mul_hi_u32 v1, v1, v5
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v13
+; GFX7-NEXT:    v_mul_lo_u32 v8, v0, v4
+; GFX7-NEXT:    v_mul_hi_u32 v0, v0, v6
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v0, v11
+; GFX7-NEXT:    v_mov_b32_e32 v0, v8
+; GFX7-NEXT:    v_mov_b32_e32 v1, v9
+; GFX7-NEXT:    v_mov_b32_e32 v2, v10
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_mul_i128:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_lo_u32 v9, v1, v4
+; GFX8-NEXT:    v_mul_lo_u32 v10, v0, v5
+; GFX8-NEXT:    v_mul_hi_u32 v11, v0, v4
+; GFX8-NEXT:    v_mul_lo_u32 v12, v1, v5
+; GFX8-NEXT:    v_mul_lo_u32 v13, v0, v6
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v10
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v11
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
+; GFX8-NEXT:    v_mul_lo_u32 v11, v2, v4
+; GFX8-NEXT:    v_mul_hi_u32 v14, v1, v4
+; GFX8-NEXT:    v_mul_hi_u32 v15, v0, v5
+; GFX8-NEXT:    v_mul_lo_u32 v3, v3, v4
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v12
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v13
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v13
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v14
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v13
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v15
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v13
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v11, v10
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v12, v11
+; GFX8-NEXT:    v_mul_lo_u32 v12, v2, v5
+; GFX8-NEXT:    v_mul_lo_u32 v13, v1, v6
+; GFX8-NEXT:    v_mul_lo_u32 v7, v0, v7
+; GFX8-NEXT:    v_mul_hi_u32 v2, v2, v4
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v12
+; GFX8-NEXT:    v_mul_hi_u32 v1, v1, v5
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v13
+; GFX8-NEXT:    v_mul_lo_u32 v8, v0, v4
+; GFX8-NEXT:    v_mul_hi_u32 v0, v0, v6
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v0, v11
+; GFX8-NEXT:    v_mov_b32_e32 v0, v8
+; GFX8-NEXT:    v_mov_b32_e32 v1, v9
+; GFX8-NEXT:    v_mov_b32_e32 v2, v10
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_mul_i128:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_lo_u32 v9, v1, v4
+; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v5
+; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v4
+; GFX9-NEXT:    v_mul_lo_u32 v12, v1, v5
+; GFX9-NEXT:    v_mul_lo_u32 v13, v0, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v9, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v9, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-NEXT:    v_add_u32_e32 v10, v10, v11
+; GFX9-NEXT:    v_mul_lo_u32 v11, v2, v4
+; GFX9-NEXT:    v_mul_hi_u32 v14, v1, v4
+; GFX9-NEXT:    v_mul_hi_u32 v15, v0, v5
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v11, v12
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v11, v13
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v11, v14
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v11, v15
+; GFX9-NEXT:    v_add3_u32 v12, v12, v13, v14
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v11, v12, v13, v11
+; GFX9-NEXT:    v_mul_lo_u32 v12, v2, v5
+; GFX9-NEXT:    v_mul_lo_u32 v13, v1, v6
+; GFX9-NEXT:    v_mul_lo_u32 v7, v0, v7
+; GFX9-NEXT:    v_mul_hi_u32 v2, v2, v4
+; GFX9-NEXT:    v_mul_hi_u32 v1, v1, v5
+; GFX9-NEXT:    v_mul_lo_u32 v8, v0, v4
+; GFX9-NEXT:    v_mul_hi_u32 v0, v0, v6
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v12
+; GFX9-NEXT:    v_add3_u32 v3, v3, v13, v7
+; GFX9-NEXT:    v_add3_u32 v1, v3, v2, v1
+; GFX9-NEXT:    v_add3_u32 v3, v1, v0, v11
+; GFX9-NEXT:    v_mov_b32_e32 v0, v8
+; GFX9-NEXT:    v_mov_b32_e32 v1, v9
+; GFX9-NEXT:    v_mov_b32_e32 v2, v10
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i128 %num, %den
   ret i128 %result
 }
 
 define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
+; GFX7-LABEL: s_mul_i256:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX7-NEXT:    s_mul_i32 s17, s1, s8
+; GFX7-NEXT:    s_mul_i32 s18, s0, s9
+; GFX7-NEXT:    s_add_u32 s17, s17, s18
+; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX7-NEXT:    s_lshl_b32 s18, s18, 31
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s17, v0
+; GFX7-NEXT:    s_lshr_b32 s18, s18, 31
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, s18, v1
+; GFX7-NEXT:    s_mul_i32 s17, s2, s8
+; GFX7-NEXT:    s_mul_i32 s18, s1, s9
+; GFX7-NEXT:    s_add_u32 s17, s17, s18
+; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    s_lshl_b32 s18, s18, 31
+; GFX7-NEXT:    s_mul_i32 s19, s0, s10
+; GFX7-NEXT:    s_lshr_b32 s18, s18, 31
+; GFX7-NEXT:    v_mul_hi_u32 v2, v2, s8
+; GFX7-NEXT:    s_add_u32 s17, s17, s19
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s17, v2
+; GFX7-NEXT:    v_mov_b32_e32 v3, s9
+; GFX7-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX7-NEXT:    v_mul_hi_u32 v4, s0, v3
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, s18, v5
+; GFX7-NEXT:    s_mul_i32 s17, s3, s8
+; GFX7-NEXT:    s_mul_i32 s18, s2, s9
+; GFX7-NEXT:    s_add_u32 s17, s17, s18
+; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX7-NEXT:    s_lshl_b32 s18, s18, 31
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX7-NEXT:    s_mul_i32 s19, s1, s10
+; GFX7-NEXT:    s_lshr_b32 s18, s18, 31
+; GFX7-NEXT:    s_add_u32 s17, s17, s19
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v4, s2
+; GFX7-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX7-NEXT:    v_mul_hi_u32 v5, v4, s8
+; GFX7-NEXT:    s_mul_i32 s20, s0, s11
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    s_add_u32 s17, s17, s20
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, s17, v5
+; GFX7-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, s18, v8
+; GFX7-NEXT:    s_mul_i32 s17, s4, s8
+; GFX7-NEXT:    s_mul_i32 s18, s3, s9
+; GFX7-NEXT:    v_mul_hi_u32 v3, s1, v3
+; GFX7-NEXT:    s_add_u32 s17, s17, s18
+; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX7-NEXT:    s_lshl_b32 s18, s18, 31
+; GFX7-NEXT:    v_mov_b32_e32 v6, s10
+; GFX7-NEXT:    v_mul_hi_u32 v7, s0, v6
+; GFX7-NEXT:    s_mul_i32 s19, s2, s10
+; GFX7-NEXT:    s_lshr_b32 s18, s18, 31
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX7-NEXT:    s_add_u32 s17, s17, s19
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GFX7-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GFX7-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX7-NEXT:    s_mul_i32 s20, s1, s11
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    s_add_u32 s17, s17, s20
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX7-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v5, s3
+; GFX7-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX7-NEXT:    s_mul_i32 s21, s0, s12
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    v_mul_hi_u32 v7, v5, s8
+; GFX7-NEXT:    s_add_u32 s17, s17, s21
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, s17, v7
+; GFX7-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v11, vcc, s18, v11
+; GFX7-NEXT:    s_mul_i32 s17, s5, s8
+; GFX7-NEXT:    s_mul_i32 s18, s4, s9
+; GFX7-NEXT:    s_add_u32 s17, s17, s18
+; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX7-NEXT:    v_mul_hi_u32 v4, v4, s9
+; GFX7-NEXT:    s_lshl_b32 s18, s18, 31
+; GFX7-NEXT:    s_mul_i32 s19, s3, s10
+; GFX7-NEXT:    s_lshr_b32 s18, s18, 31
+; GFX7-NEXT:    s_add_u32 s17, s17, s19
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    v_mul_hi_u32 v8, s1, v6
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GFX7-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v9, s11
+; GFX7-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
+; GFX7-NEXT:    v_mul_hi_u32 v10, s0, v9
+; GFX7-NEXT:    s_mul_i32 s20, s2, s11
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GFX7-NEXT:    s_add_u32 s17, s17, s20
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX7-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; GFX7-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX7-NEXT:    s_mul_i32 s21, s1, s12
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    s_add_u32 s17, s17, s21
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX7-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GFX7-NEXT:    v_mov_b32_e32 v7, s4
+; GFX7-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX7-NEXT:    s_mul_i32 s22, s0, s13
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    v_mul_hi_u32 v8, v7, s8
+; GFX7-NEXT:    s_add_u32 s17, s17, s22
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, s17, v8
+; GFX7-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v14, vcc, s18, v14
+; GFX7-NEXT:    s_mul_i32 s17, s6, s8
+; GFX7-NEXT:    s_mul_i32 s18, s5, s9
+; GFX7-NEXT:    s_add_u32 s17, s17, s18
+; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX7-NEXT:    s_lshl_b32 s18, s18, 31
+; GFX7-NEXT:    s_mul_i32 s19, s4, s10
+; GFX7-NEXT:    s_lshr_b32 s18, s18, 31
+; GFX7-NEXT:    v_mul_hi_u32 v10, v5, s9
+; GFX7-NEXT:    s_add_u32 s17, s17, s19
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX7-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX7-NEXT:    v_mul_hi_u32 v6, s2, v6
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GFX7-NEXT:    s_mul_i32 s20, s3, s11
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX7-NEXT:    s_add_u32 s17, s17, s20
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
+; GFX7-NEXT:    v_mul_hi_u32 v11, s1, v9
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; GFX7-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v12, s12
+; GFX7-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GFX7-NEXT:    v_mul_hi_u32 v13, s0, v12
+; GFX7-NEXT:    s_mul_i32 s21, s2, s12
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
+; GFX7-NEXT:    s_add_u32 s17, s17, s21
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX7-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
+; GFX7-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX7-NEXT:    s_mul_i32 s22, s1, s13
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GFX7-NEXT:    s_add_u32 s17, s17, s22
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX7-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; GFX7-NEXT:    v_mov_b32_e32 v8, s5
+; GFX7-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX7-NEXT:    v_mul_hi_u32 v10, v8, s8
+; GFX7-NEXT:    s_mul_i32 s23, s0, s14
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    s_add_u32 s17, s17, s23
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX7-NEXT:    v_mul_hi_u32 v11, v7, s9
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, s17, v10
+; GFX7-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v17, vcc, s18, v17
+; GFX7-NEXT:    v_mul_hi_u32 v5, v5, s10
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX7-NEXT:    v_mul_hi_u32 v13, s2, v9
+; GFX7-NEXT:    v_add_i32_e32 v11, vcc, v17, v11
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GFX7-NEXT:    v_mul_hi_u32 v14, s1, v12
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v15, s13
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GFX7-NEXT:    v_mul_hi_u32 v16, s0, v15
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v16
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GFX7-NEXT:    v_mov_b32_e32 v13, s14
+; GFX7-NEXT:    s_mul_i32 s7, s7, s8
+; GFX7-NEXT:    s_mul_i32 s17, s6, s9
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GFX7-NEXT:    s_mul_i32 s16, s0, s8
+; GFX7-NEXT:    s_mul_i32 s5, s5, s10
+; GFX7-NEXT:    s_mul_i32 s15, s0, s15
+; GFX7-NEXT:    v_mul_hi_u32 v13, s0, v13
+; GFX7-NEXT:    s_add_i32 s0, s7, s17
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX7-NEXT:    s_mul_i32 s4, s4, s11
+; GFX7-NEXT:    s_add_i32 s0, s0, s5
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
+; GFX7-NEXT:    v_mov_b32_e32 v10, s6
+; GFX7-NEXT:    s_mul_i32 s11, s3, s12
+; GFX7-NEXT:    s_add_i32 s0, s0, s4
+; GFX7-NEXT:    s_mul_i32 s12, s2, s13
+; GFX7-NEXT:    s_add_i32 s0, s0, s11
+; GFX7-NEXT:    v_mul_hi_u32 v10, v10, s8
+; GFX7-NEXT:    s_mul_i32 s13, s1, s14
+; GFX7-NEXT:    s_add_i32 s0, s0, s12
+; GFX7-NEXT:    v_mul_hi_u32 v8, v8, s9
+; GFX7-NEXT:    s_add_i32 s0, s0, s13
+; GFX7-NEXT:    v_mul_hi_u32 v7, v7, s10
+; GFX7-NEXT:    v_mul_hi_u32 v9, s3, v9
+; GFX7-NEXT:    s_add_i32 s0, s0, s15
+; GFX7-NEXT:    v_mul_hi_u32 v11, s2, v12
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, s0, v10
+; GFX7-NEXT:    v_mul_hi_u32 v12, s1, v15
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX7-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX7-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX7-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX7-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX7-NEXT:    v_readfirstlane_b32 s6, v5
+; GFX7-NEXT:    v_readfirstlane_b32 s7, v6
+; GFX7-NEXT:    s_mov_b32 s0, s16
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_mul_i256:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX8-NEXT:    s_mul_i32 s17, s1, s8
+; GFX8-NEXT:    s_mul_i32 s18, s0, s9
+; GFX8-NEXT:    s_add_u32 s17, s17, s18
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    s_lshl_b32 s18, s18, 31
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s17, v0
+; GFX8-NEXT:    s_lshr_b32 s18, s18, 31
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s18, v1
+; GFX8-NEXT:    s_mul_i32 s17, s2, s8
+; GFX8-NEXT:    s_mul_i32 s18, s1, s9
+; GFX8-NEXT:    s_add_u32 s17, s17, s18
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    s_lshl_b32 s18, s18, 31
+; GFX8-NEXT:    s_mul_i32 s19, s0, s10
+; GFX8-NEXT:    s_lshr_b32 s18, s18, 31
+; GFX8-NEXT:    v_mul_hi_u32 v2, v2, s8
+; GFX8-NEXT:    s_add_u32 s17, s17, s19
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s17, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s9
+; GFX8-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX8-NEXT:    v_mul_hi_u32 v4, s0, v3
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s18, v5
+; GFX8-NEXT:    s_mul_i32 s17, s3, s8
+; GFX8-NEXT:    s_mul_i32 s18, s2, s9
+; GFX8-NEXT:    s_add_u32 s17, s17, s18
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-NEXT:    s_lshl_b32 s18, s18, 31
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT:    s_mul_i32 s19, s1, s10
+; GFX8-NEXT:    s_lshr_b32 s18, s18, 31
+; GFX8-NEXT:    s_add_u32 s17, s17, s19
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX8-NEXT:    v_mul_hi_u32 v5, v4, s8
+; GFX8-NEXT:    s_mul_i32 s20, s0, s11
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    s_add_u32 s17, s17, s20
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s17, v5
+; GFX8-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s18, v8
+; GFX8-NEXT:    s_mul_i32 s17, s4, s8
+; GFX8-NEXT:    s_mul_i32 s18, s3, s9
+; GFX8-NEXT:    v_mul_hi_u32 v3, s1, v3
+; GFX8-NEXT:    s_add_u32 s17, s17, s18
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    s_lshl_b32 s18, s18, 31
+; GFX8-NEXT:    v_mov_b32_e32 v6, s10
+; GFX8-NEXT:    v_mul_hi_u32 v7, s0, v6
+; GFX8-NEXT:    s_mul_i32 s19, s2, s10
+; GFX8-NEXT:    s_lshr_b32 s18, s18, 31
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
+; GFX8-NEXT:    s_add_u32 s17, s17, s19
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v8, v5
+; GFX8-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
+; GFX8-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX8-NEXT:    s_mul_i32 s20, s1, s11
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    s_add_u32 s17, s17, s20
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v7
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX8-NEXT:    s_mul_i32 s21, s0, s12
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    v_mul_hi_u32 v7, v5, s8
+; GFX8-NEXT:    s_add_u32 s17, s17, s21
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s17, v7
+; GFX8-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s18, v11
+; GFX8-NEXT:    s_mul_i32 s17, s5, s8
+; GFX8-NEXT:    s_mul_i32 s18, s4, s9
+; GFX8-NEXT:    s_add_u32 s17, s17, s18
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    v_mul_hi_u32 v4, v4, s9
+; GFX8-NEXT:    s_lshl_b32 s18, s18, 31
+; GFX8-NEXT:    s_mul_i32 s19, s3, s10
+; GFX8-NEXT:    s_lshr_b32 s18, s18, 31
+; GFX8-NEXT:    s_add_u32 s17, s17, s19
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    v_mul_hi_u32 v8, s1, v6
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
+; GFX8-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v9, s11
+; GFX8-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v11, v7
+; GFX8-NEXT:    v_mul_hi_u32 v10, s0, v9
+; GFX8-NEXT:    s_mul_i32 s20, s2, s11
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
+; GFX8-NEXT:    s_add_u32 s17, s17, s20
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v10
+; GFX8-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT:    s_mul_i32 s21, s1, s12
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    s_add_u32 s17, s17, s21
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
+; GFX8-NEXT:    v_mov_b32_e32 v7, s4
+; GFX8-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX8-NEXT:    s_mul_i32 s22, s0, s13
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    v_mul_hi_u32 v8, v7, s8
+; GFX8-NEXT:    s_add_u32 s17, s17, s22
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s17, v8
+; GFX8-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, s18, v14
+; GFX8-NEXT:    s_mul_i32 s17, s6, s8
+; GFX8-NEXT:    s_mul_i32 s18, s5, s9
+; GFX8-NEXT:    s_add_u32 s17, s17, s18
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    s_lshl_b32 s18, s18, 31
+; GFX8-NEXT:    s_mul_i32 s19, s4, s10
+; GFX8-NEXT:    s_lshr_b32 s18, s18, 31
+; GFX8-NEXT:    v_mul_hi_u32 v10, v5, s9
+; GFX8-NEXT:    s_add_u32 s17, s17, s19
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX8-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX8-NEXT:    v_mul_hi_u32 v6, s2, v6
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
+; GFX8-NEXT:    s_mul_i32 s20, s3, s11
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX8-NEXT:    s_add_u32 s17, s17, s20
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v14, v10
+; GFX8-NEXT:    v_mul_hi_u32 v11, s1, v9
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6
+; GFX8-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v12, s12
+; GFX8-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v10, v8
+; GFX8-NEXT:    v_mul_hi_u32 v13, s0, v12
+; GFX8-NEXT:    s_mul_i32 s21, s2, s12
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v11
+; GFX8-NEXT:    s_add_u32 s17, s17, s21
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX8-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v13
+; GFX8-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX8-NEXT:    s_mul_i32 s22, s1, s13
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
+; GFX8-NEXT:    s_add_u32 s17, s17, s22
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v6, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6
+; GFX8-NEXT:    v_mov_b32_e32 v8, s5
+; GFX8-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX8-NEXT:    v_mul_hi_u32 v10, v8, s8
+; GFX8-NEXT:    s_mul_i32 s23, s0, s14
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    s_add_u32 s17, s17, s23
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX8-NEXT:    v_mul_hi_u32 v11, v7, s9
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s17, v10
+; GFX8-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s18, v17
+; GFX8-NEXT:    v_mul_hi_u32 v5, v5, s10
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX8-NEXT:    v_mul_hi_u32 v13, s2, v9
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v17, v11
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v10, v5
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v11, v10
+; GFX8-NEXT:    v_mul_hi_u32 v14, s1, v12
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v13
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v15, s13
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
+; GFX8-NEXT:    v_mul_hi_u32 v16, s0, v15
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v14
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v16
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
+; GFX8-NEXT:    v_mov_b32_e32 v13, s14
+; GFX8-NEXT:    s_mul_i32 s7, s7, s8
+; GFX8-NEXT:    s_mul_i32 s17, s6, s9
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v6
+; GFX8-NEXT:    s_mul_i32 s16, s0, s8
+; GFX8-NEXT:    s_mul_i32 s5, s5, s10
+; GFX8-NEXT:    s_mul_i32 s15, s0, s15
+; GFX8-NEXT:    v_mul_hi_u32 v13, s0, v13
+; GFX8-NEXT:    s_add_i32 s0, s7, s17
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT:    s_mul_i32 s4, s4, s11
+; GFX8-NEXT:    s_add_i32 s0, s0, s5
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v10, v6
+; GFX8-NEXT:    v_mov_b32_e32 v10, s6
+; GFX8-NEXT:    s_mul_i32 s11, s3, s12
+; GFX8-NEXT:    s_add_i32 s0, s0, s4
+; GFX8-NEXT:    s_mul_i32 s12, s2, s13
+; GFX8-NEXT:    s_add_i32 s0, s0, s11
+; GFX8-NEXT:    v_mul_hi_u32 v10, v10, s8
+; GFX8-NEXT:    s_mul_i32 s13, s1, s14
+; GFX8-NEXT:    s_add_i32 s0, s0, s12
+; GFX8-NEXT:    v_mul_hi_u32 v8, v8, s9
+; GFX8-NEXT:    s_add_i32 s0, s0, s13
+; GFX8-NEXT:    v_mul_hi_u32 v7, v7, s10
+; GFX8-NEXT:    v_mul_hi_u32 v9, s3, v9
+; GFX8-NEXT:    s_add_i32 s0, s0, s15
+; GFX8-NEXT:    v_mul_hi_u32 v11, s2, v12
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s0, v10
+; GFX8-NEXT:    v_mul_hi_u32 v12, s1, v15
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v10, v8
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v8, v7
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v11
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v12
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v13
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
+; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX8-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX8-NEXT:    v_readfirstlane_b32 s6, v5
+; GFX8-NEXT:    v_readfirstlane_b32 s7, v6
+; GFX8-NEXT:    s_mov_b32 s0, s16
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_mul_i256:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s16, s0
+; GFX9-NEXT:    s_mul_i32 s17, s1, s8
+; GFX9-NEXT:    s_mul_i32 s18, s16, s9
+; GFX9-NEXT:    s_add_u32 s17, s17, s18
+; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s18, s18, 31
+; GFX9-NEXT:    s_mul_hi_u32 s19, s16, s8
+; GFX9-NEXT:    s_lshr_b32 s18, s18, 31
+; GFX9-NEXT:    s_add_u32 s17, s17, s19
+; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX9-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX9-NEXT:    s_add_i32 s18, s18, s19
+; GFX9-NEXT:    s_mul_i32 s19, s2, s8
+; GFX9-NEXT:    s_mul_i32 s20, s1, s9
+; GFX9-NEXT:    s_add_u32 s19, s19, s20
+; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s20, s20, 31
+; GFX9-NEXT:    s_mul_i32 s21, s16, s10
+; GFX9-NEXT:    s_lshr_b32 s20, s20, 31
+; GFX9-NEXT:    s_add_u32 s19, s19, s21
+; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s21, s21, 31
+; GFX9-NEXT:    s_lshr_b32 s21, s21, 31
+; GFX9-NEXT:    s_mul_hi_u32 s22, s1, s8
+; GFX9-NEXT:    s_add_i32 s20, s20, s21
+; GFX9-NEXT:    s_add_u32 s19, s19, s22
+; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s21, s21, 31
+; GFX9-NEXT:    s_lshr_b32 s21, s21, 31
+; GFX9-NEXT:    s_mul_hi_u32 s23, s16, s9
+; GFX9-NEXT:    s_add_i32 s20, s20, s21
+; GFX9-NEXT:    s_add_u32 s19, s19, s23
+; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s21, s21, 31
+; GFX9-NEXT:    s_lshr_b32 s21, s21, 31
+; GFX9-NEXT:    s_add_i32 s20, s20, s21
+; GFX9-NEXT:    s_add_u32 s18, s19, s18
+; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s19, s19, 31
+; GFX9-NEXT:    s_lshr_b32 s19, s19, 31
+; GFX9-NEXT:    s_add_i32 s20, s20, s19
+; GFX9-NEXT:    s_mul_i32 s19, s3, s8
+; GFX9-NEXT:    s_mul_i32 s21, s2, s9
+; GFX9-NEXT:    s_add_u32 s19, s19, s21
+; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s21, s21, 31
+; GFX9-NEXT:    s_mul_i32 s22, s1, s10
+; GFX9-NEXT:    s_lshr_b32 s21, s21, 31
+; GFX9-NEXT:    s_add_u32 s19, s19, s22
+; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s22, s22, 31
+; GFX9-NEXT:    s_lshr_b32 s22, s22, 31
+; GFX9-NEXT:    s_mul_i32 s23, s16, s11
+; GFX9-NEXT:    s_add_i32 s21, s21, s22
+; GFX9-NEXT:    s_add_u32 s19, s19, s23
+; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s22, s22, 31
+; GFX9-NEXT:    s_lshr_b32 s22, s22, 31
+; GFX9-NEXT:    s_mul_hi_u32 s24, s2, s8
+; GFX9-NEXT:    s_add_i32 s21, s21, s22
+; GFX9-NEXT:    s_add_u32 s19, s19, s24
+; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s22, s22, 31
+; GFX9-NEXT:    s_lshr_b32 s22, s22, 31
+; GFX9-NEXT:    s_mul_hi_u32 s25, s1, s9
+; GFX9-NEXT:    s_add_i32 s21, s21, s22
+; GFX9-NEXT:    s_add_u32 s19, s19, s25
+; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s22, s22, 31
+; GFX9-NEXT:    s_lshr_b32 s22, s22, 31
+; GFX9-NEXT:    s_mul_hi_u32 s26, s16, s10
+; GFX9-NEXT:    s_add_i32 s21, s21, s22
+; GFX9-NEXT:    s_add_u32 s19, s19, s26
+; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s22, s22, 31
+; GFX9-NEXT:    s_lshr_b32 s22, s22, 31
+; GFX9-NEXT:    s_add_i32 s21, s21, s22
+; GFX9-NEXT:    s_add_u32 s19, s19, s20
+; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s20, s20, 31
+; GFX9-NEXT:    s_lshr_b32 s20, s20, 31
+; GFX9-NEXT:    s_add_i32 s21, s21, s20
+; GFX9-NEXT:    s_mul_i32 s20, s4, s8
+; GFX9-NEXT:    s_mul_i32 s22, s3, s9
+; GFX9-NEXT:    s_add_u32 s20, s20, s22
+; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s22, s22, 31
+; GFX9-NEXT:    s_mul_i32 s23, s2, s10
+; GFX9-NEXT:    s_lshr_b32 s22, s22, 31
+; GFX9-NEXT:    s_add_u32 s20, s20, s23
+; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s23, s23, 31
+; GFX9-NEXT:    s_lshr_b32 s23, s23, 31
+; GFX9-NEXT:    s_mul_i32 s24, s1, s11
+; GFX9-NEXT:    s_add_i32 s22, s22, s23
+; GFX9-NEXT:    s_add_u32 s20, s20, s24
+; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s23, s23, 31
+; GFX9-NEXT:    s_lshr_b32 s23, s23, 31
+; GFX9-NEXT:    s_mul_i32 s25, s16, s12
+; GFX9-NEXT:    s_add_i32 s22, s22, s23
+; GFX9-NEXT:    s_add_u32 s20, s20, s25
+; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s23, s23, 31
+; GFX9-NEXT:    s_lshr_b32 s23, s23, 31
+; GFX9-NEXT:    s_mul_hi_u32 s26, s3, s8
+; GFX9-NEXT:    s_add_i32 s22, s22, s23
+; GFX9-NEXT:    s_add_u32 s20, s20, s26
+; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s23, s23, 31
+; GFX9-NEXT:    s_lshr_b32 s23, s23, 31
+; GFX9-NEXT:    s_mul_hi_u32 s27, s2, s9
+; GFX9-NEXT:    s_add_i32 s22, s22, s23
+; GFX9-NEXT:    s_add_u32 s20, s20, s27
+; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s23, s23, 31
+; GFX9-NEXT:    s_lshr_b32 s23, s23, 31
+; GFX9-NEXT:    s_mul_hi_u32 s28, s1, s10
+; GFX9-NEXT:    s_add_i32 s22, s22, s23
+; GFX9-NEXT:    s_add_u32 s20, s20, s28
+; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s23, s23, 31
+; GFX9-NEXT:    s_lshr_b32 s23, s23, 31
+; GFX9-NEXT:    s_mul_hi_u32 s29, s16, s11
+; GFX9-NEXT:    s_add_i32 s22, s22, s23
+; GFX9-NEXT:    s_add_u32 s20, s20, s29
+; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s23, s23, 31
+; GFX9-NEXT:    s_lshr_b32 s23, s23, 31
+; GFX9-NEXT:    s_add_i32 s22, s22, s23
+; GFX9-NEXT:    s_add_u32 s20, s20, s21
+; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s21, s21, 31
+; GFX9-NEXT:    s_lshr_b32 s21, s21, 31
+; GFX9-NEXT:    s_add_i32 s22, s22, s21
+; GFX9-NEXT:    s_mul_i32 s21, s5, s8
+; GFX9-NEXT:    s_mul_i32 s23, s4, s9
+; GFX9-NEXT:    s_add_u32 s21, s21, s23
+; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s23, s23, 31
+; GFX9-NEXT:    s_mul_i32 s24, s3, s10
+; GFX9-NEXT:    s_lshr_b32 s23, s23, 31
+; GFX9-NEXT:    s_add_u32 s21, s21, s24
+; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s24, s24, 31
+; GFX9-NEXT:    s_lshr_b32 s24, s24, 31
+; GFX9-NEXT:    s_mul_i32 s25, s2, s11
+; GFX9-NEXT:    s_add_i32 s23, s23, s24
+; GFX9-NEXT:    s_add_u32 s21, s21, s25
+; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s24, s24, 31
+; GFX9-NEXT:    s_lshr_b32 s24, s24, 31
+; GFX9-NEXT:    s_mul_i32 s26, s1, s12
+; GFX9-NEXT:    s_add_i32 s23, s23, s24
+; GFX9-NEXT:    s_add_u32 s21, s21, s26
+; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s24, s24, 31
+; GFX9-NEXT:    s_lshr_b32 s24, s24, 31
+; GFX9-NEXT:    s_mul_i32 s27, s16, s13
+; GFX9-NEXT:    s_add_i32 s23, s23, s24
+; GFX9-NEXT:    s_add_u32 s21, s21, s27
+; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s24, s24, 31
+; GFX9-NEXT:    s_lshr_b32 s24, s24, 31
+; GFX9-NEXT:    s_mul_hi_u32 s28, s4, s8
+; GFX9-NEXT:    s_add_i32 s23, s23, s24
+; GFX9-NEXT:    s_add_u32 s21, s21, s28
+; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s24, s24, 31
+; GFX9-NEXT:    s_lshr_b32 s24, s24, 31
+; GFX9-NEXT:    s_mul_hi_u32 s29, s3, s9
+; GFX9-NEXT:    s_add_i32 s23, s23, s24
+; GFX9-NEXT:    s_add_u32 s21, s21, s29
+; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s24, s24, 31
+; GFX9-NEXT:    s_lshr_b32 s24, s24, 31
+; GFX9-NEXT:    s_mul_hi_u32 s30, s2, s10
+; GFX9-NEXT:    s_add_i32 s23, s23, s24
+; GFX9-NEXT:    s_add_u32 s21, s21, s30
+; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s24, s24, 31
+; GFX9-NEXT:    s_lshr_b32 s24, s24, 31
+; GFX9-NEXT:    s_mul_hi_u32 s31, s1, s11
+; GFX9-NEXT:    s_add_i32 s23, s23, s24
+; GFX9-NEXT:    s_add_u32 s21, s21, s31
+; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s24, s24, 31
+; GFX9-NEXT:    s_lshr_b32 s24, s24, 31
+; GFX9-NEXT:    s_mul_hi_u32 s32, s16, s12
+; GFX9-NEXT:    s_add_i32 s23, s23, s24
+; GFX9-NEXT:    s_add_u32 s21, s21, s32
+; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s24, s24, 31
+; GFX9-NEXT:    s_lshr_b32 s24, s24, 31
+; GFX9-NEXT:    s_add_i32 s23, s23, s24
+; GFX9-NEXT:    s_add_u32 s21, s21, s22
+; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s22, s22, 31
+; GFX9-NEXT:    s_lshr_b32 s22, s22, 31
+; GFX9-NEXT:    s_add_i32 s23, s23, s22
+; GFX9-NEXT:    s_mul_i32 s22, s6, s8
+; GFX9-NEXT:    s_mul_i32 s24, s5, s9
+; GFX9-NEXT:    s_add_u32 s22, s22, s24
+; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s24, s24, 31
+; GFX9-NEXT:    s_mul_i32 s25, s4, s10
+; GFX9-NEXT:    s_lshr_b32 s24, s24, 31
+; GFX9-NEXT:    s_add_u32 s22, s22, s25
+; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s25, s25, 31
+; GFX9-NEXT:    s_lshr_b32 s25, s25, 31
+; GFX9-NEXT:    s_mul_i32 s26, s3, s11
+; GFX9-NEXT:    s_add_i32 s24, s24, s25
+; GFX9-NEXT:    s_add_u32 s22, s22, s26
+; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s25, s25, 31
+; GFX9-NEXT:    s_lshr_b32 s25, s25, 31
+; GFX9-NEXT:    s_mul_i32 s27, s2, s12
+; GFX9-NEXT:    s_add_i32 s24, s24, s25
+; GFX9-NEXT:    s_add_u32 s22, s22, s27
+; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s25, s25, 31
+; GFX9-NEXT:    s_lshr_b32 s25, s25, 31
+; GFX9-NEXT:    s_mul_i32 s28, s1, s13
+; GFX9-NEXT:    s_add_i32 s24, s24, s25
+; GFX9-NEXT:    s_add_u32 s22, s22, s28
+; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s25, s25, 31
+; GFX9-NEXT:    s_lshr_b32 s25, s25, 31
+; GFX9-NEXT:    s_mul_i32 s29, s16, s14
+; GFX9-NEXT:    s_add_i32 s24, s24, s25
+; GFX9-NEXT:    s_add_u32 s22, s22, s29
+; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s25, s25, 31
+; GFX9-NEXT:    s_lshr_b32 s25, s25, 31
+; GFX9-NEXT:    s_mul_hi_u32 s30, s5, s8
+; GFX9-NEXT:    s_add_i32 s24, s24, s25
+; GFX9-NEXT:    s_add_u32 s22, s22, s30
+; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s25, s25, 31
+; GFX9-NEXT:    s_lshr_b32 s25, s25, 31
+; GFX9-NEXT:    s_mul_hi_u32 s31, s4, s9
+; GFX9-NEXT:    s_add_i32 s24, s24, s25
+; GFX9-NEXT:    s_add_u32 s22, s22, s31
+; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s25, s25, 31
+; GFX9-NEXT:    s_lshr_b32 s25, s25, 31
+; GFX9-NEXT:    s_mul_hi_u32 s32, s3, s10
+; GFX9-NEXT:    s_add_i32 s24, s24, s25
+; GFX9-NEXT:    s_add_u32 s22, s22, s32
+; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s25, s25, 31
+; GFX9-NEXT:    s_lshr_b32 s25, s25, 31
+; GFX9-NEXT:    s_mul_hi_u32 s33, s2, s11
+; GFX9-NEXT:    s_add_i32 s24, s24, s25
+; GFX9-NEXT:    s_add_u32 s22, s22, s33
+; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s25, s25, 31
+; GFX9-NEXT:    s_lshr_b32 s25, s25, 31
+; GFX9-NEXT:    s_mul_hi_u32 s34, s1, s12
+; GFX9-NEXT:    s_add_i32 s24, s24, s25
+; GFX9-NEXT:    s_add_u32 s22, s22, s34
+; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s25, s25, 31
+; GFX9-NEXT:    s_lshr_b32 s25, s25, 31
+; GFX9-NEXT:    s_mul_hi_u32 s35, s16, s13
+; GFX9-NEXT:    s_add_i32 s24, s24, s25
+; GFX9-NEXT:    s_add_u32 s22, s22, s35
+; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s25, s25, 31
+; GFX9-NEXT:    s_lshr_b32 s25, s25, 31
+; GFX9-NEXT:    s_add_i32 s24, s24, s25
+; GFX9-NEXT:    s_add_u32 s22, s22, s23
+; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s23, s23, 31
+; GFX9-NEXT:    s_lshr_b32 s23, s23, 31
+; GFX9-NEXT:    s_add_i32 s24, s24, s23
+; GFX9-NEXT:    s_mul_i32 s23, s6, s9
+; GFX9-NEXT:    s_mul_i32 s7, s7, s8
+; GFX9-NEXT:    s_mul_i32 s25, s5, s10
+; GFX9-NEXT:    s_add_i32 s7, s7, s23
+; GFX9-NEXT:    s_mul_i32 s26, s4, s11
+; GFX9-NEXT:    s_add_i32 s7, s7, s25
+; GFX9-NEXT:    s_mul_i32 s27, s3, s12
+; GFX9-NEXT:    s_add_i32 s7, s7, s26
+; GFX9-NEXT:    s_mul_i32 s28, s2, s13
+; GFX9-NEXT:    s_add_i32 s7, s7, s27
+; GFX9-NEXT:    s_mul_i32 s29, s1, s14
+; GFX9-NEXT:    s_add_i32 s7, s7, s28
+; GFX9-NEXT:    s_mul_i32 s15, s16, s15
+; GFX9-NEXT:    s_add_i32 s7, s7, s29
+; GFX9-NEXT:    s_mul_hi_u32 s6, s6, s8
+; GFX9-NEXT:    s_add_i32 s7, s7, s15
+; GFX9-NEXT:    s_mul_hi_u32 s5, s5, s9
+; GFX9-NEXT:    s_add_i32 s6, s7, s6
+; GFX9-NEXT:    s_add_i32 s5, s6, s5
+; GFX9-NEXT:    s_mul_hi_u32 s4, s4, s10
+; GFX9-NEXT:    s_add_i32 s4, s5, s4
+; GFX9-NEXT:    s_mul_hi_u32 s3, s3, s11
+; GFX9-NEXT:    s_add_i32 s3, s4, s3
+; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s12
+; GFX9-NEXT:    s_add_i32 s2, s3, s2
+; GFX9-NEXT:    s_mul_hi_u32 s1, s1, s13
+; GFX9-NEXT:    s_mul_i32 s0, s0, s8
+; GFX9-NEXT:    s_add_i32 s1, s2, s1
+; GFX9-NEXT:    s_mul_hi_u32 s8, s16, s14
+; GFX9-NEXT:    s_add_i32 s1, s1, s8
+; GFX9-NEXT:    s_add_i32 s7, s1, s24
+; GFX9-NEXT:    s_mov_b32 s1, s17
+; GFX9-NEXT:    s_mov_b32 s2, s18
+; GFX9-NEXT:    s_mov_b32 s3, s19
+; GFX9-NEXT:    s_mov_b32 s4, s20
+; GFX9-NEXT:    s_mov_b32 s5, s21
+; GFX9-NEXT:    s_mov_b32 s6, s22
+; GFX9-NEXT:    ; return to shader part epilog
   %result = mul i256 %num, %den
   %cast = bitcast i256 %result to <8 x i32>
   ret <8 x i32> %cast
 }
 
 define i256 @v_mul_i256(i256 %num, i256 %den) {
+; GFX7-LABEL: v_mul_i256:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_lo_u32 v16, v1, v8
+; GFX7-NEXT:    v_mul_lo_u32 v17, v0, v9
+; GFX7-NEXT:    v_mul_hi_u32 v18, v0, v8
+; GFX7-NEXT:    v_mul_lo_u32 v19, v2, v8
+; GFX7-NEXT:    v_mul_lo_u32 v20, v1, v9
+; GFX7-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
+; GFX7-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v16, vcc, v16, v18
+; GFX7-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
+; GFX7-NEXT:    v_mul_lo_u32 v18, v0, v10
+; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
+; GFX7-NEXT:    v_mul_hi_u32 v21, v1, v8
+; GFX7-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
+; GFX7-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v20, v19
+; GFX7-NEXT:    v_add_i32_e32 v18, vcc, v18, v21
+; GFX7-NEXT:    v_mul_hi_u32 v21, v0, v9
+; GFX7-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
+; GFX7-NEXT:    v_mul_lo_u32 v22, v0, v11
+; GFX7-NEXT:    v_add_i32_e32 v18, vcc, v18, v21
+; GFX7-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
+; GFX7-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
+; GFX7-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GFX7-NEXT:    v_mul_lo_u32 v20, v3, v8
+; GFX7-NEXT:    v_mul_lo_u32 v21, v2, v9
+; GFX7-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
+; GFX7-NEXT:    v_mul_lo_u32 v19, v1, v10
+; GFX7-NEXT:    v_mul_lo_u32 v23, v1, v11
+; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v21
+; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v20, v19
+; GFX7-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v21, v20
+; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v22
+; GFX7-NEXT:    v_mul_hi_u32 v22, v2, v8
+; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v21
+; GFX7-NEXT:    v_mul_lo_u32 v7, v7, v8
+; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v22
+; GFX7-NEXT:    v_mul_hi_u32 v22, v1, v9
+; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v21
+; GFX7-NEXT:    v_mul_lo_u32 v15, v0, v15
+; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v22
+; GFX7-NEXT:    v_mul_hi_u32 v22, v0, v10
+; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v21
+; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v22
+; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v21
+; GFX7-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
+; GFX7-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GFX7-NEXT:    v_mul_lo_u32 v21, v4, v8
+; GFX7-NEXT:    v_mul_lo_u32 v22, v3, v9
+; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v20, v19
+; GFX7-NEXT:    v_mul_lo_u32 v20, v2, v10
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v22
+; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v21, v20
+; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v22, v21
+; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v23
+; GFX7-NEXT:    v_mul_lo_u32 v23, v0, v12
+; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v22
+; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v23
+; GFX7-NEXT:    v_mul_hi_u32 v23, v3, v8
+; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v22
+; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v23
+; GFX7-NEXT:    v_mul_hi_u32 v23, v2, v9
+; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v22
+; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v23
+; GFX7-NEXT:    v_mul_hi_u32 v23, v1, v10
+; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v22
+; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v23
+; GFX7-NEXT:    v_mul_hi_u32 v23, v0, v11
+; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v22
+; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v23
+; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v22
+; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v20, v19
+; GFX7-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GFX7-NEXT:    v_mul_lo_u32 v22, v5, v8
+; GFX7-NEXT:    v_mul_lo_u32 v23, v4, v9
+; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v21, v20
+; GFX7-NEXT:    v_mul_lo_u32 v21, v3, v10
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
+; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v22, v21
+; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v23, v22
+; GFX7-NEXT:    v_mul_lo_u32 v23, v2, v11
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v23
+; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
+; GFX7-NEXT:    v_mul_lo_u32 v23, v1, v12
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v23
+; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
+; GFX7-NEXT:    v_mul_lo_u32 v23, v0, v13
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v23
+; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
+; GFX7-NEXT:    v_mul_hi_u32 v23, v4, v8
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v23
+; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
+; GFX7-NEXT:    v_mul_hi_u32 v23, v3, v9
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v23
+; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
+; GFX7-NEXT:    v_mul_hi_u32 v23, v2, v10
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v23
+; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
+; GFX7-NEXT:    v_mul_hi_u32 v23, v1, v11
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v23
+; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
+; GFX7-NEXT:    v_mul_hi_u32 v23, v0, v12
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v23
+; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
+; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v21, v20
+; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v22, v21
+; GFX7-NEXT:    v_mul_lo_u32 v22, v6, v8
+; GFX7-NEXT:    v_mul_lo_u32 v23, v5, v9
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
+; GFX7-NEXT:    v_mul_lo_u32 v23, v4, v10
+; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
+; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v24, v23
+; GFX7-NEXT:    v_mul_lo_u32 v24, v3, v11
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
+; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
+; GFX7-NEXT:    v_mul_lo_u32 v24, v2, v12
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
+; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
+; GFX7-NEXT:    v_mul_lo_u32 v24, v1, v13
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
+; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
+; GFX7-NEXT:    v_mul_lo_u32 v24, v0, v14
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
+; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
+; GFX7-NEXT:    v_mul_hi_u32 v24, v5, v8
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
+; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
+; GFX7-NEXT:    v_mul_hi_u32 v24, v4, v9
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
+; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
+; GFX7-NEXT:    v_mul_hi_u32 v24, v3, v10
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
+; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
+; GFX7-NEXT:    v_mul_hi_u32 v24, v2, v11
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
+; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
+; GFX7-NEXT:    v_mul_hi_u32 v24, v1, v12
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
+; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
+; GFX7-NEXT:    v_mul_hi_u32 v24, v0, v13
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
+; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v22, v21
+; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v22
+; GFX7-NEXT:    v_mul_lo_u32 v22, v0, v8
+; GFX7-NEXT:    v_mul_hi_u32 v8, v6, v8
+; GFX7-NEXT:    v_mul_lo_u32 v6, v6, v9
+; GFX7-NEXT:    v_mul_hi_u32 v9, v5, v9
+; GFX7-NEXT:    v_mul_lo_u32 v5, v5, v10
+; GFX7-NEXT:    v_mul_hi_u32 v10, v4, v10
+; GFX7-NEXT:    v_mul_lo_u32 v4, v4, v11
+; GFX7-NEXT:    v_mul_hi_u32 v11, v3, v11
+; GFX7-NEXT:    v_mul_lo_u32 v3, v3, v12
+; GFX7-NEXT:    v_mul_hi_u32 v12, v2, v12
+; GFX7-NEXT:    v_mul_lo_u32 v2, v2, v13
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GFX7-NEXT:    v_mul_hi_u32 v13, v1, v13
+; GFX7-NEXT:    v_mul_lo_u32 v1, v1, v14
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
+; GFX7-NEXT:    v_mul_hi_u32 v0, v0, v14
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v0, v23
+; GFX7-NEXT:    v_mov_b32_e32 v0, v22
+; GFX7-NEXT:    v_mov_b32_e32 v1, v16
+; GFX7-NEXT:    v_mov_b32_e32 v2, v17
+; GFX7-NEXT:    v_mov_b32_e32 v3, v18
+; GFX7-NEXT:    v_mov_b32_e32 v4, v19
+; GFX7-NEXT:    v_mov_b32_e32 v5, v20
+; GFX7-NEXT:    v_mov_b32_e32 v6, v21
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_mul_i256:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_lo_u32 v16, v1, v8
+; GFX8-NEXT:    v_mul_lo_u32 v17, v0, v9
+; GFX8-NEXT:    v_mul_hi_u32 v18, v0, v8
+; GFX8-NEXT:    v_mul_lo_u32 v19, v2, v8
+; GFX8-NEXT:    v_mul_lo_u32 v20, v1, v9
+; GFX8-NEXT:    v_add_u32_e32 v16, vcc, v16, v17
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v16, vcc, v16, v18
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v18
+; GFX8-NEXT:    v_mul_lo_u32 v18, v0, v10
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v20
+; GFX8-NEXT:    v_mul_hi_u32 v21, v1, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v19, v18
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v20, v19
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v21
+; GFX8-NEXT:    v_mul_hi_u32 v21, v0, v9
+; GFX8-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v20
+; GFX8-NEXT:    v_mul_lo_u32 v22, v0, v11
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v21
+; GFX8-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v20
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v18, v17
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v20, v3, v8
+; GFX8-NEXT:    v_mul_lo_u32 v21, v2, v9
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v19, v18
+; GFX8-NEXT:    v_mul_lo_u32 v19, v1, v10
+; GFX8-NEXT:    v_mul_lo_u32 v23, v1, v11
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v21
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v20, v19
+; GFX8-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v21, v20
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v22
+; GFX8-NEXT:    v_mul_hi_u32 v22, v2, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v21
+; GFX8-NEXT:    v_mul_lo_u32 v7, v7, v8
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v22
+; GFX8-NEXT:    v_mul_hi_u32 v22, v1, v9
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v21
+; GFX8-NEXT:    v_mul_lo_u32 v15, v0, v15
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v22
+; GFX8-NEXT:    v_mul_hi_u32 v22, v0, v10
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v21
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v22
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v21
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v19, v18
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v21, v4, v8
+; GFX8-NEXT:    v_mul_lo_u32 v22, v3, v9
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v20, v19
+; GFX8-NEXT:    v_mul_lo_u32 v20, v2, v10
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v22
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v21, v20
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v22, v21
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v23
+; GFX8-NEXT:    v_mul_lo_u32 v23, v0, v12
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v22
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v23
+; GFX8-NEXT:    v_mul_hi_u32 v23, v3, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v22
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v23
+; GFX8-NEXT:    v_mul_hi_u32 v23, v2, v9
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v22
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v23
+; GFX8-NEXT:    v_mul_hi_u32 v23, v1, v10
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v22
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v23
+; GFX8-NEXT:    v_mul_hi_u32 v23, v0, v11
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v22
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v23
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v22
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v20, v19
+; GFX8-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v22, v5, v8
+; GFX8-NEXT:    v_mul_lo_u32 v23, v4, v9
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v21, v20
+; GFX8-NEXT:    v_mul_lo_u32 v21, v3, v10
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v22, v21
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v23, v22
+; GFX8-NEXT:    v_mul_lo_u32 v23, v2, v11
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v23
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
+; GFX8-NEXT:    v_mul_lo_u32 v23, v1, v12
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v23
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
+; GFX8-NEXT:    v_mul_lo_u32 v23, v0, v13
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v23
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
+; GFX8-NEXT:    v_mul_hi_u32 v23, v4, v8
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v23
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
+; GFX8-NEXT:    v_mul_hi_u32 v23, v3, v9
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v23
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
+; GFX8-NEXT:    v_mul_hi_u32 v23, v2, v10
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v23
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
+; GFX8-NEXT:    v_mul_hi_u32 v23, v1, v11
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v23
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
+; GFX8-NEXT:    v_mul_hi_u32 v23, v0, v12
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v23
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v21, v20
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v22, v21
+; GFX8-NEXT:    v_mul_lo_u32 v22, v6, v8
+; GFX8-NEXT:    v_mul_lo_u32 v23, v5, v9
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
+; GFX8-NEXT:    v_mul_lo_u32 v23, v4, v10
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v24, v23
+; GFX8-NEXT:    v_mul_lo_u32 v24, v3, v11
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
+; GFX8-NEXT:    v_mul_lo_u32 v24, v2, v12
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
+; GFX8-NEXT:    v_mul_lo_u32 v24, v1, v13
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
+; GFX8-NEXT:    v_mul_lo_u32 v24, v0, v14
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
+; GFX8-NEXT:    v_mul_hi_u32 v24, v5, v8
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
+; GFX8-NEXT:    v_mul_hi_u32 v24, v4, v9
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
+; GFX8-NEXT:    v_mul_hi_u32 v24, v3, v10
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
+; GFX8-NEXT:    v_mul_hi_u32 v24, v2, v11
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
+; GFX8-NEXT:    v_mul_hi_u32 v24, v1, v12
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
+; GFX8-NEXT:    v_mul_hi_u32 v24, v0, v13
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v22, v21
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v22
+; GFX8-NEXT:    v_mul_lo_u32 v22, v0, v8
+; GFX8-NEXT:    v_mul_hi_u32 v8, v6, v8
+; GFX8-NEXT:    v_mul_lo_u32 v6, v6, v9
+; GFX8-NEXT:    v_mul_hi_u32 v9, v5, v9
+; GFX8-NEXT:    v_mul_lo_u32 v5, v5, v10
+; GFX8-NEXT:    v_mul_hi_u32 v10, v4, v10
+; GFX8-NEXT:    v_mul_lo_u32 v4, v4, v11
+; GFX8-NEXT:    v_mul_hi_u32 v11, v3, v11
+; GFX8-NEXT:    v_mul_lo_u32 v3, v3, v12
+; GFX8-NEXT:    v_mul_hi_u32 v12, v2, v12
+; GFX8-NEXT:    v_mul_lo_u32 v2, v2, v13
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v6, v5
+; GFX8-NEXT:    v_mul_hi_u32 v13, v1, v13
+; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v14
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v15
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v8
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v9
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v10
+; GFX8-NEXT:    v_mul_hi_u32 v0, v0, v14
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v11
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v12
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v13
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v0, v23
+; GFX8-NEXT:    v_mov_b32_e32 v0, v22
+; GFX8-NEXT:    v_mov_b32_e32 v1, v16
+; GFX8-NEXT:    v_mov_b32_e32 v2, v17
+; GFX8-NEXT:    v_mov_b32_e32 v3, v18
+; GFX8-NEXT:    v_mov_b32_e32 v4, v19
+; GFX8-NEXT:    v_mov_b32_e32 v5, v20
+; GFX8-NEXT:    v_mov_b32_e32 v6, v21
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_mul_i256:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_lo_u32 v16, v2, v8
+; GFX9-NEXT:    v_mul_lo_u32 v17, v1, v9
+; GFX9-NEXT:    v_mul_lo_u32 v18, v0, v10
+; GFX9-NEXT:    v_mul_hi_u32 v19, v1, v8
+; GFX9-NEXT:    v_mul_lo_u32 v20, v1, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v16, vcc, v16, v17
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v16, vcc, v16, v18
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v16, v19
+; GFX9-NEXT:    v_mul_lo_u32 v21, v0, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v18, v17, v18, v16
+; GFX9-NEXT:    v_mul_hi_u32 v16, v0, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v17, vcc, v20, v21
+; GFX9-NEXT:    v_mul_hi_u32 v21, v0, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v16, vcc, v17, v16
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v19, v21
+; GFX9-NEXT:    v_add_u32_e32 v17, v20, v17
+; GFX9-NEXT:    v_mul_lo_u32 v21, v3, v8
+; GFX9-NEXT:    v_mul_lo_u32 v22, v2, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v17, vcc, v19, v17
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v18, v18, v20, v19
+; GFX9-NEXT:    v_mul_lo_u32 v19, v1, v10
+; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v21, v22
+; GFX9-NEXT:    v_mul_lo_u32 v22, v0, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v20, v19
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v19, v22
+; GFX9-NEXT:    v_mul_hi_u32 v23, v2, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v20, v21, v20, v22
+; GFX9-NEXT:    v_mul_hi_u32 v21, v1, v9
+; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v19, v23
+; GFX9-NEXT:    v_mul_hi_u32 v23, v0, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v19, v21
+; GFX9-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v20, v20, v22, v21
+; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v19, v23
+; GFX9-NEXT:    v_mul_lo_u32 v22, v4, v8
+; GFX9-NEXT:    v_mul_lo_u32 v23, v3, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v18, vcc, v19, v18
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v19, v20, v21, v19
+; GFX9-NEXT:    v_mul_lo_u32 v20, v2, v10
+; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v22, v23
+; GFX9-NEXT:    v_mul_lo_u32 v23, v1, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v21, v20
+; GFX9-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v20, v23
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v21, v22, v21, v23
+; GFX9-NEXT:    v_mul_lo_u32 v22, v0, v12
+; GFX9-NEXT:    v_mul_hi_u32 v23, v3, v8
+; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v20, v22
+; GFX9-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v20, v23
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v21, v21, v22, v23
+; GFX9-NEXT:    v_mul_hi_u32 v22, v2, v9
+; GFX9-NEXT:    v_mul_hi_u32 v23, v1, v10
+; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v20, v22
+; GFX9-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v20, v23
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v21, v21, v22, v23
+; GFX9-NEXT:    v_mul_hi_u32 v22, v0, v11
+; GFX9-NEXT:    v_mul_lo_u32 v23, v3, v10
+; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v20, v22
+; GFX9-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v20, v19
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v20, v21, v22, v20
+; GFX9-NEXT:    v_mul_lo_u32 v21, v5, v8
+; GFX9-NEXT:    v_mul_lo_u32 v22, v4, v9
+; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v22
+; GFX9-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
+; GFX9-NEXT:    v_mul_lo_u32 v23, v2, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v22, v22, v24, v23
+; GFX9-NEXT:    v_mul_lo_u32 v23, v1, v12
+; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
+; GFX9-NEXT:    v_mul_lo_u32 v23, v0, v13
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v22, v22, v24, v23
+; GFX9-NEXT:    v_mul_hi_u32 v23, v4, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
+; GFX9-NEXT:    v_mul_hi_u32 v23, v3, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v22, v22, v24, v23
+; GFX9-NEXT:    v_mul_hi_u32 v23, v2, v10
+; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
+; GFX9-NEXT:    v_mul_hi_u32 v23, v1, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v22, v22, v24, v23
+; GFX9-NEXT:    v_mul_hi_u32 v23, v0, v12
+; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v21, v20
+; GFX9-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v21, v22, v23, v21
+; GFX9-NEXT:    v_mul_lo_u32 v22, v6, v8
+; GFX9-NEXT:    v_mul_lo_u32 v23, v5, v9
+; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v23
+; GFX9-NEXT:    v_mul_lo_u32 v23, v4, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v23
+; GFX9-NEXT:    v_mul_lo_u32 v23, v3, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v23
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v23, v24, v25, v23
+; GFX9-NEXT:    v_mul_lo_u32 v24, v2, v12
+; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
+; GFX9-NEXT:    v_mul_lo_u32 v24, v1, v13
+; GFX9-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v23, v23, v25, v24
+; GFX9-NEXT:    v_mul_lo_u32 v24, v0, v14
+; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
+; GFX9-NEXT:    v_mul_hi_u32 v24, v5, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v23, v23, v25, v24
+; GFX9-NEXT:    v_mul_hi_u32 v24, v4, v9
+; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
+; GFX9-NEXT:    v_mul_hi_u32 v24, v3, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v23, v23, v25, v24
+; GFX9-NEXT:    v_mul_hi_u32 v24, v2, v11
+; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
+; GFX9-NEXT:    v_mul_hi_u32 v24, v1, v12
+; GFX9-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v23, v23, v25, v24
+; GFX9-NEXT:    v_mul_hi_u32 v24, v0, v13
+; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v22, v21
+; GFX9-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v22, v23, v24, v22
+; GFX9-NEXT:    v_mul_lo_u32 v23, v6, v9
+; GFX9-NEXT:    v_mul_lo_u32 v24, v4, v11
+; GFX9-NEXT:    v_mul_hi_u32 v4, v4, v10
+; GFX9-NEXT:    v_mul_hi_u32 v6, v6, v8
+; GFX9-NEXT:    v_add_u32_e32 v7, v7, v23
+; GFX9-NEXT:    v_mul_lo_u32 v23, v5, v10
+; GFX9-NEXT:    v_mul_hi_u32 v5, v5, v9
+; GFX9-NEXT:    v_mul_hi_u32 v9, v3, v11
+; GFX9-NEXT:    v_mul_hi_u32 v10, v2, v12
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v12
+; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v13
+; GFX9-NEXT:    v_mul_hi_u32 v11, v1, v13
+; GFX9-NEXT:    v_mul_lo_u32 v12, v1, v14
+; GFX9-NEXT:    v_mul_lo_u32 v13, v0, v15
+; GFX9-NEXT:    v_add3_u32 v7, v7, v23, v24
+; GFX9-NEXT:    v_add3_u32 v2, v7, v3, v2
+; GFX9-NEXT:    v_mul_lo_u32 v1, v0, v8
+; GFX9-NEXT:    v_add3_u32 v2, v2, v12, v13
+; GFX9-NEXT:    v_mul_hi_u32 v0, v0, v14
+; GFX9-NEXT:    v_add3_u32 v2, v2, v6, v5
+; GFX9-NEXT:    v_add3_u32 v2, v2, v4, v9
+; GFX9-NEXT:    v_add3_u32 v2, v2, v10, v11
+; GFX9-NEXT:    v_add3_u32 v7, v2, v0, v22
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v16
+; GFX9-NEXT:    v_mov_b32_e32 v2, v17
+; GFX9-NEXT:    v_mov_b32_e32 v3, v18
+; GFX9-NEXT:    v_mov_b32_e32 v4, v19
+; GFX9-NEXT:    v_mov_b32_e32 v5, v20
+; GFX9-NEXT:    v_mov_b32_e32 v6, v21
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i256 %num, %den
   ret i256 %result
 }


        


More information about the llvm-commits mailing list