[llvm] 32a4691 - [AMDGPU] Add tests for folding `fmul`/`fdiv` by Pow2 to `add`/`sub` of exp; NFC

Noah Goldstein via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 20 11:28:42 PDT 2023


Author: Noah Goldstein
Date: 2023-09-20T13:28:24-05:00
New Revision: 32a46919a2f3009d19a2de75d1dbb0f530aa19ce

URL: https://github.com/llvm/llvm-project/commit/32a46919a2f3009d19a2de75d1dbb0f530aa19ce
DIFF: https://github.com/llvm/llvm-project/commit/32a46919a2f3009d19a2de75d1dbb0f530aa19ce.diff

LOG: [AMDGPU] Add tests for folding `fmul`/`fdiv` by Pow2 to `add`/`sub` of exp; NFC

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D159405

Added: 
    llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
new file mode 100644
index 000000000000000..5e5d1a74690ee0b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -0,0 +1,3952 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+
+declare i16 @llvm.umax.i16(i16, i16)
+declare i64 @llvm.umin.i64(i64, i64)
+
+declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>)
+
+define <4 x float> @fmul_pow2_4xfloat(<4 x i32> %i) { ; Per lane: 9.0 * uitofp(1 << %i); the recorded asm below still does shift + convert + multiply.
+; VI-LABEL: fmul_pow2_4xfloat:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; VI-NEXT:    v_lshlrev_b32_e64 v1, v1, 1
+; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, 1
+; VI-NEXT:    v_lshlrev_b32_e64 v3, v3, 1
+; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; VI-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; VI-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; VI-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
+; VI-NEXT:    v_mul_f32_e32 v1, 0x41100000, v1
+; VI-NEXT:    v_mul_f32_e32 v2, 0x41100000, v2
+; VI-NEXT:    v_mul_f32_e32 v3, 0x41100000, v3
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_pow2_4xfloat:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v1, 1
+; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v2, 1
+; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v3, 1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x41100000, v1
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x41100000, v2
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0x41100000, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_pow2_4xfloat:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; GFX11-NEXT:    v_lshlrev_b32_e64 v1, v1, 1
+; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v2, 1
+; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v3, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mul_f32 v0, 0x41100000, v0 :: v_dual_mul_f32 v1, 0x41100000, v1
+; GFX11-NEXT:    v_dual_mul_f32 v2, 0x41100000, v2 :: v_dual_mul_f32 v3, 0x41100000, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %p2 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %i ; per-lane power of two: 1 << %i
+  %p2_f = uitofp <4 x i32> %p2 to <4 x float> ; that power of two as an unsigned-converted float
+  %r = fmul <4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, %p2_f ; constant 9.0 (literal 0x41100000 in the asm) times the power of two
+  ret <4 x float> %r
+}
+
+define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) { ; Calls llvm.ldexp on a 9.0 splat with %i as exponent; the recorded asm is one v_ldexp_f32 per lane.
+; VI-LABEL: fmul_pow2_ldexp_4xfloat:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, 0x41100000
+; VI-NEXT:    v_ldexp_f32 v0, s4, v0
+; VI-NEXT:    v_ldexp_f32 v1, s4, v1
+; VI-NEXT:    v_ldexp_f32 v2, s4, v2
+; VI-NEXT:    v_ldexp_f32 v3, s4, v3
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_pow2_ldexp_4xfloat:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_ldexp_f32 v0, 0x41100000, v0
+; GFX10-NEXT:    v_ldexp_f32 v1, 0x41100000, v1
+; GFX10-NEXT:    v_ldexp_f32 v2, 0x41100000, v2
+; GFX10-NEXT:    v_ldexp_f32 v3, 0x41100000, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_pow2_ldexp_4xfloat:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_ldexp_f32 v0, 0x41100000, v0
+; GFX11-NEXT:    v_ldexp_f32 v1, 0x41100000, v1
+; GFX11-NEXT:    v_ldexp_f32 v2, 0x41100000, v2
+; GFX11-NEXT:    v_ldexp_f32 v3, 0x41100000, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, <4 x i32> %i) ; ldexp(9.0, %i) per lane; %i is passed straight through as the exponent operand
+  ret <4 x float> %r
+}
+
+define <4 x float> @fdiv_pow2_4xfloat(<4 x i32> %i) { ; Per lane: 9.0 / uitofp(1 << %i); the recorded asm is the div_scale / rcp / fma / div_fmas / div_fixup sequence.
+; VI-LABEL: fdiv_pow2_4xfloat:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; VI-NEXT:    s_mov_b32 s10, 0x41100000
+; VI-NEXT:    v_lshlrev_b32_e64 v1, v1, 1
+; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; VI-NEXT:    v_div_scale_f32 v4, s[4:5], v0, v0, s10
+; VI-NEXT:    v_div_scale_f32 v6, vcc, s10, v0, s10
+; VI-NEXT:    v_div_scale_f32 v5, s[4:5], v1, v1, s10
+; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, 1
+; VI-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; VI-NEXT:    v_div_scale_f32 v7, s[4:5], s10, v1, s10
+; VI-NEXT:    v_lshlrev_b32_e64 v3, v3, 1
+; VI-NEXT:    v_div_scale_f32 v9, s[6:7], v2, v2, s10
+; VI-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; VI-NEXT:    v_rcp_f32_e32 v8, v4
+; VI-NEXT:    v_rcp_f32_e32 v10, v5
+; VI-NEXT:    v_fma_f32 v11, -v4, v8, 1.0
+; VI-NEXT:    v_fma_f32 v8, v11, v8, v8
+; VI-NEXT:    v_mul_f32_e32 v11, v6, v8
+; VI-NEXT:    v_fma_f32 v12, -v4, v11, v6
+; VI-NEXT:    v_fma_f32 v11, v12, v8, v11
+; VI-NEXT:    v_fma_f32 v4, -v4, v11, v6
+; VI-NEXT:    v_div_scale_f32 v6, s[6:7], s10, v2, s10
+; VI-NEXT:    v_div_fmas_f32 v4, v4, v8, v11
+; VI-NEXT:    v_div_scale_f32 v11, s[8:9], v3, v3, s10
+; VI-NEXT:    v_fma_f32 v8, -v5, v10, 1.0
+; VI-NEXT:    v_fma_f32 v8, v8, v10, v10
+; VI-NEXT:    v_mul_f32_e32 v10, v7, v8
+; VI-NEXT:    v_fma_f32 v12, -v5, v10, v7
+; VI-NEXT:    v_fma_f32 v10, v12, v8, v10
+; VI-NEXT:    v_div_scale_f32 v12, s[8:9], s10, v3, s10
+; VI-NEXT:    v_rcp_f32_e32 v13, v9
+; VI-NEXT:    v_fma_f32 v5, -v5, v10, v7
+; VI-NEXT:    s_mov_b64 vcc, s[4:5]
+; VI-NEXT:    v_div_fmas_f32 v5, v5, v8, v10
+; VI-NEXT:    v_fma_f32 v7, -v9, v13, 1.0
+; VI-NEXT:    v_fma_f32 v7, v7, v13, v13
+; VI-NEXT:    v_mul_f32_e32 v8, v6, v7
+; VI-NEXT:    v_fma_f32 v10, -v9, v8, v6
+; VI-NEXT:    v_fma_f32 v8, v10, v7, v8
+; VI-NEXT:    v_rcp_f32_e32 v10, v11
+; VI-NEXT:    v_fma_f32 v6, -v9, v8, v6
+; VI-NEXT:    s_mov_b64 vcc, s[6:7]
+; VI-NEXT:    v_div_fmas_f32 v6, v6, v7, v8
+; VI-NEXT:    v_fma_f32 v7, -v11, v10, 1.0
+; VI-NEXT:    v_fma_f32 v7, v7, v10, v10
+; VI-NEXT:    v_mul_f32_e32 v8, v12, v7
+; VI-NEXT:    v_fma_f32 v9, -v11, v8, v12
+; VI-NEXT:    v_fma_f32 v8, v9, v7, v8
+; VI-NEXT:    v_fma_f32 v9, -v11, v8, v12
+; VI-NEXT:    s_mov_b64 vcc, s[8:9]
+; VI-NEXT:    v_div_fmas_f32 v7, v9, v7, v8
+; VI-NEXT:    v_div_fixup_f32 v0, v4, v0, s10
+; VI-NEXT:    v_div_fixup_f32 v1, v5, v1, s10
+; VI-NEXT:    v_div_fixup_f32 v2, v6, v2, s10
+; VI-NEXT:    v_div_fixup_f32 v3, v7, v3, s10
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fdiv_pow2_4xfloat:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v1, 1
+; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v2, 1
+; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v3, 1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX10-NEXT:    v_div_scale_f32 v4, s4, v0, v0, 0x41100000
+; GFX10-NEXT:    v_div_scale_f32 v5, s4, v1, v1, 0x41100000
+; GFX10-NEXT:    v_div_scale_f32 v6, s4, v2, v2, 0x41100000
+; GFX10-NEXT:    v_rcp_f32_e32 v8, v4
+; GFX10-NEXT:    v_div_scale_f32 v7, s4, v3, v3, 0x41100000
+; GFX10-NEXT:    v_rcp_f32_e32 v9, v5
+; GFX10-NEXT:    v_rcp_f32_e32 v10, v6
+; GFX10-NEXT:    v_div_scale_f32 v12, vcc_lo, 0x41100000, v0, 0x41100000
+; GFX10-NEXT:    v_rcp_f32_e32 v11, v7
+; GFX10-NEXT:    v_div_scale_f32 v16, s4, 0x41100000, v1, 0x41100000
+; GFX10-NEXT:    v_fma_f32 v13, -v4, v8, 1.0
+; GFX10-NEXT:    v_fma_f32 v14, -v5, v9, 1.0
+; GFX10-NEXT:    v_fma_f32 v15, -v6, v10, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v8, v13, v8
+; GFX10-NEXT:    v_div_scale_f32 v13, s5, 0x41100000, v2, 0x41100000
+; GFX10-NEXT:    v_fma_f32 v17, -v7, v11, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v9, v14, v9
+; GFX10-NEXT:    v_fmac_f32_e32 v10, v15, v10
+; GFX10-NEXT:    v_mul_f32_e32 v15, v12, v8
+; GFX10-NEXT:    v_div_scale_f32 v14, s6, 0x41100000, v3, 0x41100000
+; GFX10-NEXT:    v_fmac_f32_e32 v11, v17, v11
+; GFX10-NEXT:    v_mul_f32_e32 v17, v16, v9
+; GFX10-NEXT:    v_mul_f32_e32 v18, v13, v10
+; GFX10-NEXT:    v_fma_f32 v20, -v4, v15, v12
+; GFX10-NEXT:    v_mul_f32_e32 v19, v14, v11
+; GFX10-NEXT:    v_fma_f32 v21, -v5, v17, v16
+; GFX10-NEXT:    v_fma_f32 v22, -v6, v18, v13
+; GFX10-NEXT:    v_fmac_f32_e32 v15, v20, v8
+; GFX10-NEXT:    v_fma_f32 v23, -v7, v19, v14
+; GFX10-NEXT:    v_fmac_f32_e32 v17, v21, v9
+; GFX10-NEXT:    v_fmac_f32_e32 v18, v22, v10
+; GFX10-NEXT:    v_fma_f32 v4, -v4, v15, v12
+; GFX10-NEXT:    v_fmac_f32_e32 v19, v23, v11
+; GFX10-NEXT:    v_fma_f32 v5, -v5, v17, v16
+; GFX10-NEXT:    v_fma_f32 v6, -v6, v18, v13
+; GFX10-NEXT:    v_div_fmas_f32 v4, v4, v8, v15
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
+; GFX10-NEXT:    v_fma_f32 v7, -v7, v19, v14
+; GFX10-NEXT:    v_div_fmas_f32 v5, v5, v9, v17
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s5
+; GFX10-NEXT:    v_div_fixup_f32 v0, v4, v0, 0x41100000
+; GFX10-NEXT:    v_div_fmas_f32 v6, v6, v10, v18
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s6
+; GFX10-NEXT:    v_div_fixup_f32 v1, v5, v1, 0x41100000
+; GFX10-NEXT:    v_div_fmas_f32 v7, v7, v11, v19
+; GFX10-NEXT:    v_div_fixup_f32 v2, v6, v2, 0x41100000
+; GFX10-NEXT:    v_div_fixup_f32 v3, v7, v3, 0x41100000
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fdiv_pow2_4xfloat:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v2, 1
+; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v3, 1
+; GFX11-NEXT:    v_lshlrev_b32_e64 v1, v1, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_div_scale_f32 v4, null, v0, v0, 0x41100000
+; GFX11-NEXT:    v_div_scale_f32 v6, null, v2, v2, 0x41100000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_div_scale_f32 v5, null, v1, v1, 0x41100000
+; GFX11-NEXT:    v_div_scale_f32 v7, null, v3, v3, 0x41100000
+; GFX11-NEXT:    v_rcp_f32_e32 v8, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_rcp_f32_e32 v10, v6
+; GFX11-NEXT:    v_rcp_f32_e32 v9, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
+; GFX11-NEXT:    v_rcp_f32_e32 v11, v7
+; GFX11-NEXT:    v_div_scale_f32 v12, vcc_lo, 0x41100000, v0, 0x41100000
+; GFX11-NEXT:    v_div_scale_f32 v16, s0, 0x41100000, v1, 0x41100000
+; GFX11-NEXT:    v_fma_f32 v13, -v4, v8, 1.0
+; GFX11-NEXT:    v_fma_f32 v15, -v6, v10, 1.0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_fma_f32 v14, -v5, v9, 1.0
+; GFX11-NEXT:    v_fma_f32 v17, -v7, v11, 1.0
+; GFX11-NEXT:    v_fmac_f32_e32 v8, v13, v8
+; GFX11-NEXT:    v_fmac_f32_e32 v10, v15, v10
+; GFX11-NEXT:    v_div_scale_f32 v13, s1, 0x41100000, v2, 0x41100000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_fmac_f32_e32 v11, v17, v11
+; GFX11-NEXT:    v_fmac_f32_e32 v9, v14, v9
+; GFX11-NEXT:    v_dual_mul_f32 v15, v12, v8 :: v_dual_mul_f32 v18, v13, v10
+; GFX11-NEXT:    v_div_scale_f32 v14, s2, 0x41100000, v3, 0x41100000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_fma_f32 v20, -v4, v15, v12
+; GFX11-NEXT:    v_fma_f32 v22, -v6, v18, v13
+; GFX11-NEXT:    v_mul_f32_e32 v17, v16, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mul_f32_e32 v19, v14, v11
+; GFX11-NEXT:    v_dual_fmac_f32 v15, v20, v8 :: v_dual_fmac_f32 v18, v22, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_fma_f32 v21, -v5, v17, v16
+; GFX11-NEXT:    v_fma_f32 v23, -v7, v19, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_fma_f32 v4, -v4, v15, v12
+; GFX11-NEXT:    v_fma_f32 v6, -v6, v18, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_fmac_f32_e32 v17, v21, v9
+; GFX11-NEXT:    v_fmac_f32_e32 v19, v23, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_div_fmas_f32 v4, v4, v8, v15
+; GFX11-NEXT:    s_mov_b32 vcc_lo, s0
+; GFX11-NEXT:    v_fma_f32 v5, -v5, v17, v16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_fma_f32 v7, -v7, v19, v14
+; GFX11-NEXT:    v_div_fixup_f32 v0, v4, v0, 0x41100000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fmas_f32 v5, v5, v9, v17
+; GFX11-NEXT:    s_mov_b32 vcc_lo, s1
+; GFX11-NEXT:    v_div_fmas_f32 v6, v6, v10, v18
+; GFX11-NEXT:    s_mov_b32 vcc_lo, s2
+; GFX11-NEXT:    v_div_fixup_f32 v1, v5, v1, 0x41100000
+; GFX11-NEXT:    v_div_fmas_f32 v7, v7, v11, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v2, v6, v2, 0x41100000
+; GFX11-NEXT:    v_div_fixup_f32 v3, v7, v3, 0x41100000
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %p2 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %i ; per-lane power of two: 1 << %i
+  %p2_f = uitofp <4 x i32> %p2 to <4 x float> ; divisor: the power of two as an unsigned-converted float
+  %r = fdiv <4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, %p2_f ; constant 9.0 (literal 0x41100000 in the asm) divided by the power of two
+  ret <4 x float> %r
+}
+
+declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>)
+
+define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; Per lane: half 0xH7000 * uitofp(i16 1 << %i); the recorded asm still does shift + convert + multiply.
+; VI-LABEL: fmul_pow2_8xhalf:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v5, 1
+; VI-NEXT:    v_lshlrev_b16_e64 v4, v3, 1
+; VI-NEXT:    v_lshlrev_b16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e64 v6, v2, 1
+; VI-NEXT:    v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e64 v7, v1, 1
+; VI-NEXT:    v_lshlrev_b16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e64 v8, v0, 1
+; VI-NEXT:    v_lshlrev_b16_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; VI-NEXT:    v_cvt_f16_u16_e32 v5, v8
+; VI-NEXT:    v_cvt_f16_u16_e32 v1, v1
+; VI-NEXT:    v_cvt_f16_u16_e32 v7, v7
+; VI-NEXT:    v_cvt_f16_u16_e32 v2, v2
+; VI-NEXT:    v_cvt_f16_u16_e32 v6, v6
+; VI-NEXT:    v_cvt_f16_u16_e32 v3, v3
+; VI-NEXT:    v_cvt_f16_u16_e32 v4, v4
+; VI-NEXT:    v_mov_b32_e32 v8, 0x7000
+; VI-NEXT:    v_mul_f16_e32 v4, 0x7000, v4
+; VI-NEXT:    v_mul_f16_sdwa v3, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_mul_f16_e32 v6, 0x7000, v6
+; VI-NEXT:    v_mul_f16_sdwa v2, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_mul_f16_e32 v7, 0x7000, v7
+; VI-NEXT:    v_mul_f16_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_mul_f16_e32 v5, 0x7000, v5
+; VI-NEXT:    v_mul_f16_sdwa v0, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v5, v0
+; VI-NEXT:    v_or_b32_e32 v1, v7, v1
+; VI-NEXT:    v_or_b32_e32 v2, v6, v2
+; VI-NEXT:    v_or_b32_e32 v3, v4, v3
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_pow2_8xhalf:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_cvt_f16_u16_e32 v4, v3
+; GFX10-NEXT:    v_cvt_f16_u16_e32 v5, v2
+; GFX10-NEXT:    v_cvt_f16_u16_e32 v6, v1
+; GFX10-NEXT:    v_cvt_f16_u16_e32 v7, v0
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_pack_b32_f16 v0, v7, v0
+; GFX10-NEXT:    v_pack_b32_f16 v1, v6, v1
+; GFX10-NEXT:    v_pack_b32_f16 v2, v5, v2
+; GFX10-NEXT:    v_pack_b32_f16 v3, v4, v3
+; GFX10-NEXT:    v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_pow2_8xhalf:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
+; GFX11-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
+; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
+; GFX11-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v4, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v5, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v6, v6
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v7, v7
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v6
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_pack_b32_f16 v2, v5, v2
+; GFX11-NEXT:    v_pack_b32_f16 v3, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
+; GFX11-NEXT:    v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
+; GFX11-NEXT:    v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i ; per-lane 16-bit power of two: 1 << %i
+  %p2_f = uitofp <8 x i16> %p2 to <8 x half> ; that power of two as an unsigned-converted half
+  %r = fmul <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f ; 0xH7000 is 8192.0 (sign 0, biased exponent 28, zero mantissa)
+  ret <8 x half> %r
+}
+
+define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; Calls llvm.ldexp on a half 0xH7000 splat with %i as exponent; the recorded asm is one v_ldexp_f16 per lane plus repacking.
+; VI-LABEL: fmul_pow2_ldexp_8xhalf:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v5, 0x7000
+; VI-NEXT:    v_ldexp_f16_e32 v4, 0x7000, v3
+; VI-NEXT:    v_ldexp_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_ldexp_f16_e32 v6, 0x7000, v2
+; VI-NEXT:    v_ldexp_f16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_ldexp_f16_e32 v7, 0x7000, v1
+; VI-NEXT:    v_ldexp_f16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_ldexp_f16_e32 v8, 0x7000, v0
+; VI-NEXT:    v_ldexp_f16_sdwa v0, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v0, v8, v0
+; VI-NEXT:    v_or_b32_e32 v1, v7, v1
+; VI-NEXT:    v_or_b32_e32 v2, v6, v2
+; VI-NEXT:    v_or_b32_e32 v3, v4, v3
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_pow2_ldexp_8xhalf:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0x7000
+; GFX10-NEXT:    v_ldexp_f16_e32 v5, 0x7000, v3
+; GFX10-NEXT:    v_ldexp_f16_e32 v6, 0x7000, v2
+; GFX10-NEXT:    v_ldexp_f16_e32 v7, 0x7000, v1
+; GFX10-NEXT:    v_ldexp_f16_e32 v8, 0x7000, v0
+; GFX10-NEXT:    v_ldexp_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_ldexp_f16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_ldexp_f16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_ldexp_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_pack_b32_f16 v0, v8, v0
+; GFX10-NEXT:    v_pack_b32_f16 v1, v7, v1
+; GFX10-NEXT:    v_pack_b32_f16 v2, v6, v2
+; GFX10-NEXT:    v_pack_b32_f16 v3, v5, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_pow2_ldexp_8xhalf:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_ldexp_f16_e32 v4, 0x7000, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_ldexp_f16_e32 v5, 0x7000, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_ldexp_f16_e32 v1, 0x7000, v1
+; GFX11-NEXT:    v_ldexp_f16_e32 v0, 0x7000, v0
+; GFX11-NEXT:    v_ldexp_f16_e32 v6, 0x7000, v6
+; GFX11-NEXT:    v_ldexp_f16_e32 v7, 0x7000, v7
+; GFX11-NEXT:    v_ldexp_f16_e32 v2, 0x7000, v2
+; GFX11-NEXT:    v_ldexp_f16_e32 v3, 0x7000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v6
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_pack_b32_f16 v2, v5, v2
+; GFX11-NEXT:    v_pack_b32_f16 v3, v4, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> %i) ; ldexp(0xH7000, %i) per lane; %i is passed straight through as the exponent operand
+  ret <8 x half> %r
+}
+
+define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) { ; Per lane: half 0xH7000 / uitofp(i16 1 << %i); the recorded asm goes through f32 v_rcp_f32, a scale by 0x46000000, and v_div_fixup_f16.
+; VI-LABEL: fdiv_pow2_8xhalf:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e64 v4, v3, 1
+; VI-NEXT:    v_mov_b32_e32 v5, 1
+; VI-NEXT:    v_lshlrev_b16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e64 v8, v0, 1
+; VI-NEXT:    v_cvt_f16_u16_e32 v4, v4
+; VI-NEXT:    v_lshlrev_b16_e64 v6, v2, 1
+; VI-NEXT:    v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e64 v7, v1, 1
+; VI-NEXT:    v_lshlrev_b16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_cvt_f16_u16_e32 v5, v8
+; VI-NEXT:    v_cvt_f32_f16_e32 v8, v4
+; VI-NEXT:    v_cvt_f16_u16_e32 v3, v3
+; VI-NEXT:    v_cvt_f32_f16_e32 v9, v3
+; VI-NEXT:    v_cvt_f16_u16_e32 v6, v6
+; VI-NEXT:    v_rcp_f32_e32 v8, v8
+; VI-NEXT:    v_cvt_f32_f16_e32 v10, v6
+; VI-NEXT:    v_rcp_f32_e32 v9, v9
+; VI-NEXT:    v_cvt_f16_u16_e32 v2, v2
+; VI-NEXT:    v_mul_f32_e32 v8, 0x46000000, v8
+; VI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; VI-NEXT:    v_mul_f32_e32 v9, 0x46000000, v9
+; VI-NEXT:    v_cvt_f32_f16_e32 v11, v2
+; VI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; VI-NEXT:    v_rcp_f32_e32 v10, v10
+; VI-NEXT:    s_movk_i32 s4, 0x7000
+; VI-NEXT:    v_cvt_f16_u16_e32 v7, v7
+; VI-NEXT:    v_div_fixup_f16 v4, v8, v4, s4
+; VI-NEXT:    v_rcp_f32_e32 v8, v11
+; VI-NEXT:    v_div_fixup_f16 v3, v9, v3, s4
+; VI-NEXT:    v_mul_f32_e32 v9, 0x46000000, v10
+; VI-NEXT:    v_cvt_f32_f16_e32 v10, v7
+; VI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; VI-NEXT:    v_mul_f32_e32 v8, 0x46000000, v8
+; VI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; VI-NEXT:    v_rcp_f32_e32 v10, v10
+; VI-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; VI-NEXT:    v_cvt_f16_u16_e32 v1, v1
+; VI-NEXT:    v_div_fixup_f16 v6, v9, v6, s4
+; VI-NEXT:    v_cvt_f32_f16_e32 v9, v1
+; VI-NEXT:    v_cvt_f32_f16_e32 v11, v0
+; VI-NEXT:    v_div_fixup_f16 v2, v8, v2, s4
+; VI-NEXT:    v_mul_f32_e32 v8, 0x46000000, v10
+; VI-NEXT:    v_cvt_f32_f16_e32 v10, v5
+; VI-NEXT:    v_rcp_f32_e32 v9, v9
+; VI-NEXT:    v_rcp_f32_e32 v11, v11
+; VI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; VI-NEXT:    v_rcp_f32_e32 v10, v10
+; VI-NEXT:    v_mul_f32_e32 v9, 0x46000000, v9
+; VI-NEXT:    v_mul_f32_e32 v11, 0x46000000, v11
+; VI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; VI-NEXT:    v_mul_f32_e32 v10, 0x46000000, v10
+; VI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; VI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; VI-NEXT:    v_div_fixup_f16 v1, v9, v1, s4
+; VI-NEXT:    v_div_fixup_f16 v7, v8, v7, s4
+; VI-NEXT:    v_div_fixup_f16 v0, v11, v0, s4
+; VI-NEXT:    v_div_fixup_f16 v5, v10, v5, s4
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; VI-NEXT:    v_or_b32_e32 v0, v5, v0
+; VI-NEXT:    v_or_b32_e32 v1, v7, v1
+; VI-NEXT:    v_or_b32_e32 v2, v6, v2
+; VI-NEXT:    v_or_b32_e32 v3, v4, v3
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fdiv_pow2_8xhalf:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    s_mov_b32 s4, 0x46000000
+; GFX10-NEXT:    v_cvt_f16_u16_e32 v5, v3
+; GFX10-NEXT:    v_cvt_f16_u16_e32 v7, v2
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v8, v5
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f16_u16_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v10, v7
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v9, v3
+; GFX10-NEXT:    v_rcp_f32_e32 v8, v8
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v11, v2
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v12, v1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v13, v6
+; GFX10-NEXT:    v_rcp_f32_e32 v10, v10
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v14, v0
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v15, v4
+; GFX10-NEXT:    v_rcp_f32_e32 v9, v9
+; GFX10-NEXT:    v_rcp_f32_e32 v11, v11
+; GFX10-NEXT:    v_rcp_f32_e32 v12, v12
+; GFX10-NEXT:    v_rcp_f32_e32 v14, v14
+; GFX10-NEXT:    v_rcp_f32_e32 v15, v15
+; GFX10-NEXT:    v_rcp_f32_e32 v13, v13
+; GFX10-NEXT:    v_fma_mixlo_f16 v8, v8, s4, 0
+; GFX10-NEXT:    v_fma_mixlo_f16 v10, v10, s4, 0
+; GFX10-NEXT:    v_div_fixup_f16 v5, v8, v5, 0x7000
+; GFX10-NEXT:    v_fma_mixlo_f16 v8, v9, s4, 0
+; GFX10-NEXT:    v_div_fixup_f16 v7, v10, v7, 0x7000
+; GFX10-NEXT:    v_fma_mixlo_f16 v9, v12, s4, 0
+; GFX10-NEXT:    v_fma_mixlo_f16 v10, v14, s4, 0
+; GFX10-NEXT:    v_fma_mixlo_f16 v12, v15, s4, 0
+; GFX10-NEXT:    v_fma_mixlo_f16 v13, v13, s4, 0
+; GFX10-NEXT:    v_fma_mixlo_f16 v11, v11, s4, 0
+; GFX10-NEXT:    v_div_fixup_f16 v1, v9, v1, 0x7000
+; GFX10-NEXT:    v_div_fixup_f16 v0, v10, v0, 0x7000
+; GFX10-NEXT:    v_div_fixup_f16 v4, v12, v4, 0x7000
+; GFX10-NEXT:    v_div_fixup_f16 v6, v13, v6, 0x7000
+; GFX10-NEXT:    v_div_fixup_f16 v2, v11, v2, 0x7000
+; GFX10-NEXT:    v_div_fixup_f16 v3, v8, v3, 0x7000
+; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v4
+; GFX10-NEXT:    v_pack_b32_f16 v1, v1, v6
+; GFX10-NEXT:    v_pack_b32_f16 v2, v7, v2
+; GFX10-NEXT:    v_pack_b32_f16 v3, v5, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fdiv_pow2_8xhalf:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
+; GFX11-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
+; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
+; GFX11-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
+; GFX11-NEXT:    s_mov_b32 s0, 0x46000000
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v4, v3
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v5, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v4
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, v5
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v6, v6
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v9, v9
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v2, v2
+; GFX11-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX11-NEXT:    v_rcp_f32_e32 v8, v8
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v10, v3
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v11, v1
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v12, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v13, v6
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v14, v9
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v15, v2
+; GFX11-NEXT:    v_rcp_f32_e32 v10, v10
+; GFX11-NEXT:    v_rcp_f32_e32 v11, v11
+; GFX11-NEXT:    v_rcp_f32_e32 v12, v12
+; GFX11-NEXT:    v_rcp_f32_e32 v13, v13
+; GFX11-NEXT:    v_rcp_f32_e32 v14, v14
+; GFX11-NEXT:    v_rcp_f32_e32 v15, v15
+; GFX11-NEXT:    v_fma_mixlo_f16 v7, v7, s0, 0
+; GFX11-NEXT:    v_fma_mixlo_f16 v8, v8, s0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_div_fixup_f16 v4, v7, v4, 0x7000
+; GFX11-NEXT:    v_fma_mixlo_f16 v7, v10, s0, 0
+; GFX11-NEXT:    v_div_fixup_f16 v5, v8, v5, 0x7000
+; GFX11-NEXT:    v_fma_mixlo_f16 v8, v11, s0, 0
+; GFX11-NEXT:    v_fma_mixlo_f16 v10, v12, s0, 0
+; GFX11-NEXT:    v_fma_mixlo_f16 v11, v13, s0, 0
+; GFX11-NEXT:    v_fma_mixlo_f16 v12, v14, s0, 0
+; GFX11-NEXT:    v_fma_mixlo_f16 v13, v15, s0, 0
+; GFX11-NEXT:    v_div_fixup_f16 v1, v8, v1, 0x7000
+; GFX11-NEXT:    v_div_fixup_f16 v0, v10, v0, 0x7000
+; GFX11-NEXT:    v_div_fixup_f16 v6, v11, v6, 0x7000
+; GFX11-NEXT:    v_div_fixup_f16 v8, v12, v9, 0x7000
+; GFX11-NEXT:    v_div_fixup_f16 v2, v13, v2, 0x7000
+; GFX11-NEXT:    v_div_fixup_f16 v3, v7, v3, 0x7000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v6
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_pack_b32_f16 v2, v5, v2
+; GFX11-NEXT:    v_pack_b32_f16 v3, v4, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i ; per-lane 16-bit power of two: 1 << %i
+  %p2_f = uitofp <8 x i16> %p2 to <8 x half> ; divisor: the power of two as an unsigned-converted half
+  %r = fdiv <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f ; 0xH7000 is 8192.0, the same value as the f32 literal 0x46000000 in the asm
+  ret <8 x half> %r
+}
+
+; Multiplies the constant 9.0 by uitofp(1 << %cnt); the shift is done in i64
+; and the product is a double. The assertions below record the current AMDGPU
+; output, which still performs the 64-bit shift, converts both 32-bit halves to
+; double, recombines them with ldexp/add, and then does a real f64 multiply.
+; Only the prefixes enabled by the RUN lines at the top of this file are kept.
+define double @fmul_pow_shl_cnt(i64 %cnt) nounwind {
+; VI-LABEL: fmul_pow_shl_cnt:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
+; VI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
+; VI-NEXT:    s_mov_b32 s5, 0x40220000
+; VI-NEXT:    v_ldexp_f64 v[1:2], v[1:2], 32
+; VI-NEXT:    v_add_f64 v[0:1], v[1:2], v[3:4]
+; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_pow_shl_cnt:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b32 s5, 0x40220000
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
+; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_pow_shl_cnt:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s1, 0x40220000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
+; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nuw i64 1, %cnt
+  %conv = uitofp i64 %shl to double
+  %mul = fmul double 9.000000e+00, %conv
+  ret double %mul
+}
+
+; Variant of the shift-count test with a base of 2 instead of 1 and a negative
+; multiplier: computes -9.0 * uitofp(2 << %cnt) as a double. The assertions
+; below record the current AMDGPU output, which still shifts, converts the two
+; 32-bit halves, and multiplies by the materialized constant.
+; Only the prefixes enabled by the RUN lines at the top of this file are kept.
+define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind {
+; VI-LABEL: fmul_pow_shl_cnt2:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
+; VI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
+; VI-NEXT:    s_mov_b32 s5, 0xc0220000
+; VI-NEXT:    v_ldexp_f64 v[1:2], v[1:2], 32
+; VI-NEXT:    v_add_f64 v[0:1], v[1:2], v[3:4]
+; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_pow_shl_cnt2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b32 s5, 0xc0220000
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
+; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_pow_shl_cnt2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s1, 0xc0220000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
+; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nuw i64 2, %cnt
+  %conv = uitofp i64 %shl to double
+  %mul = fmul double -9.000000e+00, %conv
+  ret double %mul
+}
+
+; The multiplicand is chosen by a select between two shifted constants:
+; %c picks (1 << %cnt), otherwise (2 << %cnt); the chosen i32 is converted to
+; float and multiplied by 9.0. In the recorded AMDGPU output the select is
+; applied to the shift base (1 or 2) and a single shift follows, then the
+; integer-to-float conversion and a real f32 multiply.
+; Only the prefixes enabled by the RUN lines at the top of this file are kept.
+define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind {
+; VI-LABEL: fmul_pow_select:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v1, 1, v1
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; VI-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
+; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; VI-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_pow_select:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_pow_select:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl2 = shl nuw i32 2, %cnt
+  %shl1 = shl nuw i32 1, %cnt
+  %shl = select i1 %c, i32 %shl1, i32 %shl2
+  %conv = uitofp i32 %shl to float
+  %mul = fmul float 9.000000e+00, %conv
+  ret float %mul
+}
+
+; The multiplicand is umin(8 << %cnt, 8192), computed in i64, converted to
+; float, and multiplied by 9.0. Both umin operands are powers of two whenever
+; the shift does not overflow (the shift carries nuw). The recorded AMDGPU
+; output keeps the 64-bit compare/select for the umin, the expanded
+; i64-to-f32 conversion (leading-zero count, normalize, ldexp), and a real f32
+; multiply.
+; Only the prefixes enabled by the RUN lines at the top of this file are kept.
+define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind {
+; VI-LABEL: fmul_fly_pow_mul_min_pow2:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
+; VI-NEXT:    s_mov_b64 s[4:5], 0x2000
+; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v2, 0x2000
+; VI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT:    v_ffbh_u32_e32 v2, v1
+; VI-NEXT:    v_min_u32_e32 v2, 32, v2
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; VI-NEXT:    v_min_u32_e32 v0, 1, v0
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
+; VI-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_fly_pow_mul_min_pow2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
+; GFX10-NEXT:    s_mov_b64 s[4:5], 0x2000
+; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x2000, v0, vcc_lo
+; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
+; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_fly_pow_mul_min_pow2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
+; GFX11-NEXT:    s_mov_b64 s[0:1], 0x2000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x2000, v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v1
+; GFX11-NEXT:    v_min_u32_e32 v2, 32, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl8 = shl nuw i64 8, %cnt
+  %shl = call i64 @llvm.umin.i64(i64 %shl8, i64 8192)
+  %conv = uitofp i64 %shl to float
+  %mul = fmul float 9.000000e+00, %conv
+  ret float %mul
+}
+
+; The multiplicand is umax(1 << %cnt, 2 << %cnt), computed in i16, converted to
+; double, and multiplied by 3.0. In the recorded AMDGPU output only one 16-bit
+; shift (of the constant 2) remains before the conversion, followed by a real
+; f64 multiply.
+; Only the prefixes enabled by the RUN lines at the top of this file are kept.
+define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind {
+; VI-LABEL: fmul_pow_mul_max_pow2:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e64 v0, v0, 2
+; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    s_mov_b32 s5, 0x40080000
+; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_pow_mul_max_pow2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b16 v0, v0, 2
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b32 s5, 0x40080000
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_pow_mul_max_pow2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b16 v0, v0, 2
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s1, 0x40080000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl2 = shl nuw i16 2, %cnt
+  %shl1 = shl nuw i16 1, %cnt
+  %shl = call i16 @llvm.umax.i16(i16 %shl1, i16 %shl2)
+  %conv = uitofp i16 %shl to double
+  %mul = fmul double 3.000000e+00, %conv
+  ret double %mul
+}
+
+; Same shape as the shift-count tests, but the shifted base is the argument %v
+; rather than a constant, so (%v << %cnt) is not known to be a power of two.
+; The `_fail_` in the name suggests this is a negative case for a
+; power-of-two-based fold (an inference from the name only). The recorded AMDGPU
+; output shifts, converts the two 32-bit halves, and does a real f64 multiply
+; by 9.0.
+; Only the prefixes enabled by the RUN lines at the top of this file are kept.
+define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind {
+; VI-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
+; VI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
+; VI-NEXT:    s_mov_b32 s5, 0x40220000
+; VI-NEXT:    v_ldexp_f64 v[1:2], v[1:2], 32
+; VI-NEXT:    v_add_f64 v[0:1], v[1:2], v[3:4]
+; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b32 s5, 0x40220000
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
+; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s1, 0x40220000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
+; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nuw i64 %v, %cnt
+  %conv = uitofp i64 %shl to double
+  %mul = fmul double 9.000000e+00, %conv
+  ret double %mul
+}
+
+define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nounwind {
+; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
+; CHECK-SSE:       # %bb.0:
+; CHECK-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; CHECK-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [2,2]
+; CHECK-SSE-NEXT:    movdqa %xmm3, %xmm1
+; CHECK-SSE-NEXT:    psllq %xmm2, %xmm1
+; CHECK-SSE-NEXT:    psllq %xmm0, %xmm3
+; CHECK-SSE-NEXT:    movq %xmm3, %rax
+; CHECK-SSE-NEXT:    testq %rax, %rax
+; CHECK-SSE-NEXT:    js .LBB6_1
+; CHECK-SSE-NEXT:  # %bb.2:
+; CHECK-SSE-NEXT:    xorps %xmm0, %xmm0
+; CHECK-SSE-NEXT:    cvtsi2ss %rax, %xmm0
+; CHECK-SSE-NEXT:    jmp .LBB6_3
+; CHECK-SSE-NEXT:  .LBB6_1:
+; CHECK-SSE-NEXT:    movq %rax, %rcx
+; CHECK-SSE-NEXT:    shrq %rcx
+; CHECK-SSE-NEXT:    andl $1, %eax
+; CHECK-SSE-NEXT:    orq %rcx, %rax
+; CHECK-SSE-NEXT:    xorps %xmm0, %xmm0
+; CHECK-SSE-NEXT:    cvtsi2ss %rax, %xmm0
+; CHECK-SSE-NEXT:    addss %xmm0, %xmm0
+; CHECK-SSE-NEXT:  .LBB6_3:
+; CHECK-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; CHECK-SSE-NEXT:    movq %xmm1, %rax
+; CHECK-SSE-NEXT:    testq %rax, %rax
+; CHECK-SSE-NEXT:    js .LBB6_4
+; CHECK-SSE-NEXT:  # %bb.5:
+; CHECK-SSE-NEXT:    xorps %xmm1, %xmm1
+; CHECK-SSE-NEXT:    cvtsi2ss %rax, %xmm1
+; CHECK-SSE-NEXT:    jmp .LBB6_6
+; CHECK-SSE-NEXT:  .LBB6_4:
+; CHECK-SSE-NEXT:    movq %rax, %rcx
+; CHECK-SSE-NEXT:    shrq %rcx
+; CHECK-SSE-NEXT:    andl $1, %eax
+; CHECK-SSE-NEXT:    orq %rcx, %rax
+; CHECK-SSE-NEXT:    xorps %xmm1, %xmm1
+; CHECK-SSE-NEXT:    cvtsi2ss %rax, %xmm1
+; CHECK-SSE-NEXT:    addss %xmm1, %xmm1
+; CHECK-SSE-NEXT:  .LBB6_6:
+; CHECK-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
+; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2,2]
+; CHECK-AVX2-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlq $1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpextrq $1, %xmm1, %rax
+; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; CHECK-AVX2-NEXT:    vmovq %xmm1, %rax
+; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
+; CHECK-AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
+; CHECK-AVX2-NEXT:    vaddps %xmm1, %xmm1, %xmm2
+; CHECK-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
+; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; CHECK-AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1]
+; CHECK-AVX2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
+; CHECK-NO-FASTFMA:       # %bb.0:
+; CHECK-NO-FASTFMA-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2,2]
+; CHECK-NO-FASTFMA-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vpextrq $1, %xmm0, %rax
+; CHECK-NO-FASTFMA-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm1
+; CHECK-NO-FASTFMA-NEXT:    vmovq %xmm0, %rax
+; CHECK-NO-FASTFMA-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; CHECK-NO-FASTFMA-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1]
+; CHECK-NO-FASTFMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    retq
+;
+; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
+; CHECK-FMA:       # %bb.0:
+; CHECK-FMA-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2,2]
+; CHECK-FMA-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
+; CHECK-FMA-NEXT:    vcvtuqq2ps %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    retq
+; VI-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b64 v[1:2], v2, 2
+; VI-NEXT:    v_ffbh_u32_e32 v3, v2
+; VI-NEXT:    v_min_u32_e32 v5, 32, v3
+; VI-NEXT:    v_lshlrev_b64 v[1:2], v5, v[1:2]
+; VI-NEXT:    v_lshlrev_b64 v[3:4], v0, 2
+; VI-NEXT:    v_min_u32_e32 v0, 1, v1
+; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    v_cvt_f32_u32_e32 v2, v0
+; VI-NEXT:    v_ffbh_u32_e32 v0, v4
+; VI-NEXT:    v_min_u32_e32 v6, 32, v0
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v6, v[3:4]
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, 32, v5
+; VI-NEXT:    v_min_u32_e32 v0, 1, v0
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; VI-NEXT:    v_ldexp_f32 v1, v2, v3
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
+; VI-NEXT:    v_ldexp_f32 v0, v0, v2
+; VI-NEXT:    v_mul_f32_e32 v0, 0x41700000, v0
+; VI-NEXT:    v_mul_f32_e32 v1, 0x41700000, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
+; GFX10-NEXT:    v_ffbh_u32_e32 v4, v1
+; GFX10-NEXT:    v_ffbh_u32_e32 v5, v3
+; GFX10-NEXT:    v_min_u32_e32 v4, 32, v4
+; GFX10-NEXT:    v_min_u32_e32 v5, 32, v5
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
+; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 32, v5
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v4
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x41700000, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x41700000, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_clz_i32_u32_e32 v4, v1
+; GFX11-NEXT:    v_clz_i32_u32_e32 v5, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_u32_e32 v4, 32, v4
+; GFX11-NEXT:    v_min_u32_e32 v5, 32, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 32, v5
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v0, 0x41700000, v0 :: v_dual_mul_f32 v1, 0x41700000, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
+  %conv = uitofp <2 x i64> %shl to <2 x float>
+  %mul = fmul <2 x float> <float 15.000000e+00, float 15.000000e+00>, %conv
+  ret <2 x float> %mul
+}
+
+define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind { ; 15.0 * uitofp(2 << %cnt) on <2 x double>, all constants splat. NOTE(review): the CHECK-* prefixes are not in the RUN lines of this file - stale?
+; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec:
+; CHECK-SSE:       # %bb.0:
+; CHECK-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [2,2]
+; CHECK-SSE-NEXT:    movdqa %xmm1, %xmm2
+; CHECK-SSE-NEXT:    psllq %xmm0, %xmm2
+; CHECK-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; CHECK-SSE-NEXT:    psllq %xmm0, %xmm1
+; CHECK-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; CHECK-SSE-NEXT:    movapd {{.*#+}} xmm0 = [4294967295,4294967295]
+; CHECK-SSE-NEXT:    andpd %xmm1, %xmm0
+; CHECK-SSE-NEXT:    orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT:    psrlq $32, %xmm1
+; CHECK-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE-NEXT:    subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE-NEXT:    addpd %xmm0, %xmm1
+; CHECK-SSE-NEXT:    mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE-NEXT:    movapd %xmm1, %xmm0
+; CHECK-SSE-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec:
+; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2,2]
+; CHECK-AVX2-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; CHECK-AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec:
+; CHECK-NO-FASTFMA:       # %bb.0:
+; CHECK-NO-FASTFMA-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2,2]
+; CHECK-NO-FASTFMA-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NO-FASTFMA-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; CHECK-NO-FASTFMA-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-NO-FASTFMA-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    retq
+;
+; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec:
+; CHECK-FMA:       # %bb.0:
+; CHECK-FMA-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2,2]
+; CHECK-FMA-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
+; CHECK-FMA-NEXT:    vcvtuqq2pd %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    retq
+; VI-LABEL: fmul_pow_shl_cnt_vec:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
+; VI-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
+; VI-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
+; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
+; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
+; VI-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
+; VI-NEXT:    v_cvt_f64_u32_e32 v[7:8], v2
+; VI-NEXT:    s_mov_b32 s5, 0x402e0000
+; VI-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
+; VI-NEXT:    v_add_f64 v[2:3], v[5:6], v[7:8]
+; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_pow_shl_cnt_vec:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b32 s5, 0x402e0000
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
+; GFX10-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
+; GFX10-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
+; GFX10-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
+; GFX10-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_pow_shl_cnt_vec:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s1, 0x402e0000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
+; GFX11-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
+; GFX11-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT:    v_mul_f64 v[2:3], v[2:3], s[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt ; per lane: 2 << %cnt
+  %conv = uitofp <2 x i64> %shl to <2 x double> ; unsigned i64 -> double
+  %mul = fmul <2 x double> <double 15.000000e+00, double 15.000000e+00>, %conv ; same constant in both lanes; the VI/GFX10/GFX11 checks above expect one v_mul_f64 per lane
+  ret <2 x double> %mul
+}
+
+define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float> %add) nounwind { ; 5.0 * uitofp(2 << %cnt) + %add on <4 x float>. NOTE(review): the CHECK-* prefixes are not in the RUN lines of this file - stale?
+; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
+; CHECK-SSE:       # %bb.0:
+; CHECK-SSE-NEXT:    pslld $23, %xmm0
+; CHECK-SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; CHECK-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,2,2]
+; CHECK-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE-NEXT:    pmuludq %xmm2, %xmm0
+; CHECK-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; CHECK-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; CHECK-SSE-NEXT:    pand %xmm0, %xmm2
+; CHECK-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-SSE-NEXT:    psrld $16, %xmm0
+; CHECK-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT:    subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT:    addps %xmm2, %xmm0
+; CHECK-SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT:    addps %xmm1, %xmm0
+; CHECK-SSE-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
+; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
+; CHECK-AVX2-NEXT:    vpsllvd %xmm0, %xmm2, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
+; CHECK-AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; CHECK-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1392508928,1392508928,1392508928,1392508928]
+; CHECK-AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; CHECK-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; CHECK-AVX2-NEXT:    vsubps %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vaddps %xmm0, %xmm2, %xmm0
+; CHECK-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0]
+; CHECK-AVX2-NEXT:    vmulps %xmm2, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
+; CHECK-NO-FASTFMA:       # %bb.0:
+; CHECK-NO-FASTFMA-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
+; CHECK-NO-FASTFMA-NEXT:    vpsllvd %xmm0, %xmm2, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; CHECK-NO-FASTFMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0]
+; CHECK-NO-FASTFMA-NEXT:    vmulps %xmm2, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vzeroupper
+; CHECK-NO-FASTFMA-NEXT:    retq
+;
+; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
+; CHECK-FMA:       # %bb.0:
+; CHECK-FMA-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
+; CHECK-FMA-NEXT:    vpsllvd %xmm0, %xmm2, %xmm0
+; CHECK-FMA-NEXT:    vcvtudq2ps %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
+; CHECK-FMA-NEXT:    retq
+; VI-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e64 v3, v3, 2
+; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, 2
+; VI-NEXT:    v_lshlrev_b32_e64 v1, v1, 2
+; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 2
+; VI-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; VI-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; VI-NEXT:    v_mul_f32_e32 v3, 0x40a00000, v3
+; VI-NEXT:    v_mul_f32_e32 v2, 0x40a00000, v2
+; VI-NEXT:    v_mul_f32_e32 v1, 0x40a00000, v1
+; VI-NEXT:    v_mul_f32_e32 v0, 0x40a00000, v0
+; VI-NEXT:    v_add_f32_e32 v0, v0, v4
+; VI-NEXT:    v_add_f32_e32 v1, v1, v5
+; VI-NEXT:    v_add_f32_e32 v2, v2, v6
+; VI-NEXT:    v_add_f32_e32 v3, v3, v7
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, 2
+; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v1, 2
+; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v2, 2
+; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v3, 2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x40a00000, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x40a00000, v1
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x40a00000, v2
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0x40a00000, v3
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, 2
+; GFX11-NEXT:    v_lshlrev_b32_e64 v1, v1, 2
+; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v2, 2
+; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v3, 2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mul_f32 v0, 0x40a00000, v0 :: v_dual_mul_f32 v1, 0x40a00000, v1
+; GFX11-NEXT:    v_dual_mul_f32 v2, 0x40a00000, v2 :: v_dual_mul_f32 v3, 0x40a00000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
+; GFX11-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nsw nuw <4 x i32> <i32 2, i32 2, i32 2, i32 2>, %cnt ; per lane: 2 << %cnt
+  %conv = uitofp <4 x i32> %shl to <4 x float> ; unsigned i32 -> float
+  %mul = fmul <4 x float> <float 5.000000e+00, float 5.000000e+00, float 5.000000e+00, float 5.000000e+00>, %conv ; splat 5.0 (the 0x40a00000 literal in the GCN checks above)
+  %res = fadd <4 x float> %mul, %add ; the fmul feeds an fadd; the VI/GFX10/GFX11 checks above expect a separate multiply and add per lane
+  ret <4 x float> %res
+}
+
+define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwind { ; fmul constant differs per lane (15.0, 14.0); shift base is splat 2. NOTE(review): the CHECK-* prefixes are not in the RUN lines of this file - stale?
+; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
+; CHECK-SSE:       # %bb.0:
+; CHECK-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [2,2]
+; CHECK-SSE-NEXT:    movdqa %xmm1, %xmm2
+; CHECK-SSE-NEXT:    psllq %xmm0, %xmm2
+; CHECK-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; CHECK-SSE-NEXT:    psllq %xmm0, %xmm1
+; CHECK-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; CHECK-SSE-NEXT:    movapd {{.*#+}} xmm0 = [4294967295,4294967295]
+; CHECK-SSE-NEXT:    andpd %xmm1, %xmm0
+; CHECK-SSE-NEXT:    orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT:    psrlq $32, %xmm1
+; CHECK-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE-NEXT:    subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE-NEXT:    addpd %xmm0, %xmm1
+; CHECK-SSE-NEXT:    mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE-NEXT:    movapd %xmm1, %xmm0
+; CHECK-SSE-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
+; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2,2]
+; CHECK-AVX2-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; CHECK-AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
+; CHECK-NO-FASTFMA:       # %bb.0:
+; CHECK-NO-FASTFMA-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2,2]
+; CHECK-NO-FASTFMA-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NO-FASTFMA-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; CHECK-NO-FASTFMA-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-NO-FASTFMA-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    retq
+;
+; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
+; CHECK-FMA:       # %bb.0:
+; CHECK-FMA-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2,2]
+; CHECK-FMA-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
+; CHECK-FMA-NEXT:    vcvtuqq2pd %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-FMA-NEXT:    retq
+; VI-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
+; VI-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
+; VI-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
+; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
+; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; VI-NEXT:    s_mov_b32 s5, 0x402e0000
+; VI-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
+; VI-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
+; VI-NEXT:    v_cvt_f64_u32_e32 v[7:8], v2
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
+; VI-NEXT:    v_add_f64 v[2:3], v[5:6], v[7:8]
+; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    s_mov_b32 s5, 0x402c0000
+; VI-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
+; GFX10-NEXT:    s_mov_b32 s5, 0x402e0000
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
+; GFX10-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
+; GFX10-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
+; GFX10-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
+; GFX10-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-NEXT:    s_mov_b32 s5, 0x402c0000
+; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
+; GFX11-NEXT:    s_mov_b32 s1, 0x402e0000
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
+; GFX11-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
+; GFX11-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s1, 0x402c0000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mul_f64 v[2:3], v[2:3], s[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt ; per lane: 2 << %cnt
+  %conv = uitofp <2 x i64> %shl to <2 x double> ; unsigned i64 -> double
+  %mul = fmul <2 x double> <double 15.000000e+00, double 14.000000e+00>, %conv ; non-splat constant: lane 0 * 15.0, lane 1 * 14.0. NOTE(review): the "_todo" suffix presumably marks a fold that is not implemented yet - confirm
+  ret <2 x double> %mul
+}
+
+define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwind { ; shift base differs per lane (2, 1); fmul constant is splat 15.0. NOTE(review): the CHECK-* prefixes are not in the RUN lines of this file - stale?
+; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
+; CHECK-SSE:       # %bb.0:
+; CHECK-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [2,1]
+; CHECK-SSE-NEXT:    movdqa %xmm1, %xmm2
+; CHECK-SSE-NEXT:    psllq %xmm0, %xmm2
+; CHECK-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; CHECK-SSE-NEXT:    psllq %xmm0, %xmm1
+; CHECK-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; CHECK-SSE-NEXT:    movapd {{.*#+}} xmm0 = [4294967295,4294967295]
+; CHECK-SSE-NEXT:    andpd %xmm1, %xmm0
+; CHECK-SSE-NEXT:    orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT:    psrlq $32, %xmm1
+; CHECK-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE-NEXT:    subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE-NEXT:    addpd %xmm0, %xmm1
+; CHECK-SSE-NEXT:    mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE-NEXT:    movapd %xmm1, %xmm0
+; CHECK-SSE-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
+; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,1]
+; CHECK-AVX2-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; CHECK-AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
+; CHECK-NO-FASTFMA:       # %bb.0:
+; CHECK-NO-FASTFMA-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,1]
+; CHECK-NO-FASTFMA-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NO-FASTFMA-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; CHECK-NO-FASTFMA-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-NO-FASTFMA-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    retq
+;
+; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
+; CHECK-FMA:       # %bb.0:
+; CHECK-FMA-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,1]
+; CHECK-FMA-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
+; CHECK-FMA-NEXT:    vcvtuqq2pd %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    retq
+; VI-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
+; VI-NEXT:    v_lshlrev_b64 v[2:3], v2, 1
+; VI-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
+; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
+; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
+; VI-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
+; VI-NEXT:    v_cvt_f64_u32_e32 v[7:8], v2
+; VI-NEXT:    s_mov_b32 s5, 0x402e0000
+; VI-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
+; VI-NEXT:    v_add_f64 v[2:3], v[5:6], v[7:8]
+; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 1
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b32 s5, 0x402e0000
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
+; GFX10-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
+; GFX10-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
+; GFX10-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
+; GFX10-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 1
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s1, 0x402e0000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
+; GFX11-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
+; GFX11-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT:    v_mul_f64 v[2:3], v[2:3], s[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nsw nuw <2 x i64> <i64 2, i64 1>, %cnt ; non-splat base: lane 0 is 2 << %cnt, lane 1 is 1 << %cnt
+  %conv = uitofp <2 x i64> %shl to <2 x double> ; unsigned i64 -> double
+  %mul = fmul <2 x double> <double 15.000000e+00, double 15.000000e+00>, %conv ; splat 15.0. NOTE(review): the "_todo" suffix presumably marks a fold that is not implemented yet - confirm
+  ret <2 x double> %mul
+}
+
+define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
+; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
+; CHECK-SSE:       # %bb.0:
+; CHECK-SSE-NEXT:    subq $40, %rsp
+; CHECK-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-SSE-NEXT:    pslld $23, %xmm0
+; CHECK-SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; CHECK-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-SSE-NEXT:    pextrw $1, %xmm0, %eax
+; CHECK-SSE-NEXT:    xorps %xmm0, %xmm0
+; CHECK-SSE-NEXT:    cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT:    callq __truncsfhf2@PLT
+; CHECK-SSE-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-SSE-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-SSE-NEXT:    xorps %xmm0, %xmm0
+; CHECK-SSE-NEXT:    cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT:    callq __truncsfhf2@PLT
+; CHECK-SSE-NEXT:    callq __extendhfsf2@PLT
+; CHECK-SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT:    callq __truncsfhf2@PLT
+; CHECK-SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-SSE-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-NEXT:    callq __extendhfsf2@PLT
+; CHECK-SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT:    callq __truncsfhf2@PLT
+; CHECK-SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-SSE-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-SSE-NEXT:    addq $40, %rsp
+; CHECK-SSE-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
+; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    subq $40, %rsp
+; CHECK-AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2]
+; CHECK-AVX2-NEXT:    vpsllvd %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-AVX2-NEXT:    vpextrw $2, %xmm0, %eax
+; CHECK-AVX2-NEXT:    vcvtsi2ss %eax, %xmm2, %xmm0
+; CHECK-AVX2-NEXT:    callq __truncsfhf2@PLT
+; CHECK-AVX2-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-AVX2-NEXT:    vpextrw $0, %xmm0, %eax
+; CHECK-AVX2-NEXT:    vcvtsi2ss %eax, %xmm2, %xmm0
+; CHECK-AVX2-NEXT:    callq __truncsfhf2@PLT
+; CHECK-AVX2-NEXT:    callq __extendhfsf2@PLT
+; CHECK-AVX2-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    callq __truncsfhf2@PLT
+; CHECK-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-AVX2-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-AVX2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX2-NEXT:    callq __extendhfsf2@PLT
+; CHECK-AVX2-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    callq __truncsfhf2@PLT
+; CHECK-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-AVX2-NEXT:    addq $40, %rsp
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
+; CHECK-NO-FASTFMA:       # %bb.0:
+; CHECK-NO-FASTFMA-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NO-FASTFMA-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0]
+; CHECK-NO-FASTFMA-NEXT:    vpsllvd %ymm0, %ymm1, %ymm0
+; CHECK-NO-FASTFMA-NEXT:    vpmovdw %zmm0, %ymm1
+; CHECK-NO-FASTFMA-NEXT:    vpextrw $0, %xmm0, %eax
+; CHECK-NO-FASTFMA-NEXT:    vcvtsi2ss %eax, %xmm2, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vmovd %xmm0, %eax
+; CHECK-NO-FASTFMA-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vpextrw $1, %xmm1, %eax
+; CHECK-NO-FASTFMA-NEXT:    vcvtsi2ss %eax, %xmm2, %xmm1
+; CHECK-NO-FASTFMA-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
+; CHECK-NO-FASTFMA-NEXT:    vmovd %xmm1, %eax
+; CHECK-NO-FASTFMA-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
+; CHECK-NO-FASTFMA-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NO-FASTFMA-NEXT:    vmovaps {{.*#+}} xmm1 = [16,0,0,0]
+; CHECK-NO-FASTFMA-NEXT:    xorl %eax, %eax
+; CHECK-NO-FASTFMA-NEXT:    vcvtsi2ss %eax, %xmm2, %xmm2
+; CHECK-NO-FASTFMA-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
+; CHECK-NO-FASTFMA-NEXT:    vmovd %xmm2, %eax
+; CHECK-NO-FASTFMA-NEXT:    vmovd %eax, %xmm2
+; CHECK-NO-FASTFMA-NEXT:    vpbroadcastw %xmm2, %xmm2
+; CHECK-NO-FASTFMA-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm2
+; CHECK-NO-FASTFMA-NEXT:    vcvtph2ps %xmm2, %ymm0
+; CHECK-NO-FASTFMA-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1]
+; CHECK-NO-FASTFMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; CHECK-NO-FASTFMA-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vzeroupper
+; CHECK-NO-FASTFMA-NEXT:    retq
+;
+; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
+; CHECK-FMA:       # %bb.0:
+; CHECK-FMA-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
+; CHECK-FMA-NEXT:    vpsllvw %xmm0, %xmm1, %xmm0
+; CHECK-FMA-NEXT:    vpextrw $7, %xmm0, %eax
+; CHECK-FMA-NEXT:    vcvtsi2ss %eax, %xmm2, %xmm1
+; CHECK-FMA-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
+; CHECK-FMA-NEXT:    vmovd %xmm1, %eax
+; CHECK-FMA-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
+; CHECK-FMA-NEXT:    vpextrw $6, %xmm0, %eax
+; CHECK-FMA-NEXT:    vcvtsi2ss %eax, %xmm2, %xmm2
+; CHECK-FMA-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
+; CHECK-FMA-NEXT:    vmovd %xmm2, %eax
+; CHECK-FMA-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
+; CHECK-FMA-NEXT:    vpextrw $5, %xmm0, %eax
+; CHECK-FMA-NEXT:    vcvtsi2ss %eax, %xmm3, %xmm3
+; CHECK-FMA-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-FMA-NEXT:    vmovd %xmm3, %eax
+; CHECK-FMA-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm3
+; CHECK-FMA-NEXT:    vpextrw $4, %xmm0, %eax
+; CHECK-FMA-NEXT:    vcvtsi2ss %eax, %xmm4, %xmm4
+; CHECK-FMA-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-FMA-NEXT:    vcvtps2ph $4, %xmm4, %xmm2
+; CHECK-FMA-NEXT:    vmovd %xmm2, %eax
+; CHECK-FMA-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
+; CHECK-FMA-NEXT:    vpextrw $3, %xmm0, %eax
+; CHECK-FMA-NEXT:    vcvtsi2ss %eax, %xmm5, %xmm4
+; CHECK-FMA-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-FMA-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; CHECK-FMA-NEXT:    vmovd %xmm4, %eax
+; CHECK-FMA-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm3
+; CHECK-FMA-NEXT:    vpextrw $2, %xmm0, %eax
+; CHECK-FMA-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-FMA-NEXT:    vcvtsi2ss %eax, %xmm5, %xmm2
+; CHECK-FMA-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
+; CHECK-FMA-NEXT:    vmovd %xmm2, %eax
+; CHECK-FMA-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
+; CHECK-FMA-NEXT:    vpextrw $1, %xmm0, %eax
+; CHECK-FMA-NEXT:    vcvtsi2ss %eax, %xmm5, %xmm4
+; CHECK-FMA-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; CHECK-FMA-NEXT:    vcvtps2ph $4, %xmm4, %xmm3
+; CHECK-FMA-NEXT:    vmovd %xmm3, %eax
+; CHECK-FMA-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm3
+; CHECK-FMA-NEXT:    vpextrw $0, %xmm0, %eax
+; CHECK-FMA-NEXT:    vcvtsi2ss %eax, %xmm5, %xmm0
+; CHECK-FMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vmovd %xmm0, %eax
+; CHECK-FMA-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; CHECK-FMA-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-FMA-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-FMA-NEXT:    vcvtph2ps %xmm0, %ymm0
+; CHECK-FMA-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; CHECK-FMA-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; CHECK-FMA-NEXT:    vzeroupper
+; CHECK-FMA-NEXT:    retq
+; VI-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, 2
+; VI-NEXT:    v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e64 v0, v0, 2
+; VI-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; VI-NEXT:    v_cvt_f16_u16_e32 v1, v1
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4b80
+; VI-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_mul_f16_e32 v0, 0x4b80, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_cvt_f16_u16_e32 v1, v0
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nsw nuw <2 x i16> <i16 2, i16 2>, %cnt
+  %conv = uitofp <2 x i16> %shl to <2 x half>
+  %mul = fmul <2 x half> <half 15.000000e+00, half 15.000000e+00>, %conv
+  ret <2 x half> %mul
+}
+
+; Computes 9.745314e+288 * uitofp(i64 (1 << %cnt)).  The checks below show the
+; 64-bit shift, the u64->f64 conversion (two v_cvt_f64_u32 plus ldexp/add) and
+; the v_mul_f64 by the constant all still being emitted on every target.
+; NOTE(review): the "fail_maybe_bad_exp" name suggests the pow2 fold is expected
+; to be rejected because the adjusted exponent may be out of range - confirm
+; against the combine this test was written for.
+define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind {
+; VI-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
+; VI-NEXT:    s_mov_b32 s4, 0xff5f3992
+; VI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
+; VI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
+; VI-NEXT:    s_mov_b32 s5, 0x7befffff
+; VI-NEXT:    v_ldexp_f64 v[1:2], v[1:2], 32
+; VI-NEXT:    v_add_f64 v[0:1], v[1:2], v[3:4]
+; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
+; GFX10-NEXT:    s_mov_b32 s4, 0xff5f3992
+; GFX10-NEXT:    s_mov_b32 s5, 0x7befffff
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
+; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
+; GFX11-NEXT:    s_mov_b32 s0, 0xff5f3992
+; GFX11-NEXT:    s_mov_b32 s1, 0x7befffff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
+; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nuw i64 1, %cnt
+  %conv = uitofp i64 %shl to double
+  %mul = fmul double 9.745314e+288, %conv
+  ret double %mul
+}
+
+; Computes 9.745314e+288 * uitofp(i16 (1 << %cnt)) as a double.  The checks
+; below still contain the 16-bit shift, the integer->double conversion and a
+; v_mul_f64 by the constant on every target.
+; NOTE(review): "safe" presumably means the i16 shift range keeps the exponent
+; adjustment in bounds, making this a candidate for the pow2 fold - confirm.
+define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind {
+; VI-LABEL: fmul_pow_shl_cnt_safe:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e64 v0, v0, 1
+; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; VI-NEXT:    s_mov_b32 s4, 0xff5f3992
+; VI-NEXT:    s_mov_b32 s5, 0x7befffff
+; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_pow_shl_cnt_safe:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b16 v0, v0, 1
+; GFX10-NEXT:    s_mov_b32 s4, 0xff5f3992
+; GFX10-NEXT:    s_mov_b32 s5, 0x7befffff
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_pow_shl_cnt_safe:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b16 v0, v0, 1
+; GFX11-NEXT:    s_mov_b32 s0, 0xff5f3992
+; GFX11-NEXT:    s_mov_b32 s1, 0x7befffff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nuw i16 1, %cnt
+  %conv = uitofp i16 %shl to double
+  %mul = fmul double 9.745314e+288, %conv
+  ret double %mul
+}
+
+; Computes <1.0, 1.0> / uitofp(<2 x i64> (1 << %cnt)) element-wise.  The checks
+; below show each lane's 64-bit shift and u64->f64 conversion followed by the
+; full v_div_scale / v_rcp / v_fma / v_div_fmas / v_div_fixup f64 division
+; sequence, i.e. the division by a power of two is not simplified here.
+define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
+; VI-LABEL: fdiv_pow_shl_cnt_vec:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
+; VI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v1
+; VI-NEXT:    v_lshlrev_b64 v[1:2], v2, 1
+; VI-NEXT:    v_cvt_f64_u32_e32 v[5:6], v2
+; VI-NEXT:    v_ldexp_f64 v[2:3], v[3:4], 32
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[5:6], 32
+; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], v0
+; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v1
+; VI-NEXT:    v_add_f64 v[2:3], v[2:3], v[6:7]
+; VI-NEXT:    v_add_f64 v[4:5], v[4:5], v[0:1]
+; VI-NEXT:    v_div_scale_f64 v[0:1], s[4:5], v[2:3], v[2:3], 1.0
+; VI-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], 1.0
+; VI-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[4:5], 1.0
+; VI-NEXT:    v_rcp_f64_e32 v[8:9], v[0:1]
+; VI-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
+; VI-NEXT:    v_fma_f64 v[12:13], -v[0:1], v[8:9], 1.0
+; VI-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; VI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-NEXT:    v_div_scale_f64 v[12:13], vcc, 1.0, v[2:3], 1.0
+; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-NEXT:    v_fma_f64 v[14:15], -v[0:1], v[8:9], 1.0
+; VI-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0
+; VI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9]
+; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11]
+; VI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[8:9]
+; VI-NEXT:    v_mul_f64 v[18:19], v[16:17], v[10:11]
+; VI-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[14:15], v[12:13]
+; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17]
+; VI-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[8:9], v[14:15]
+; VI-NEXT:    s_mov_b64 vcc, s[4:5]
+; VI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; VI-NEXT:    v_div_fixup_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[4:5], 1.0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fdiv_pow_shl_cnt_vec:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 1
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
+; GFX10-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
+; GFX10-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
+; GFX10-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
+; GFX10-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
+; GFX10-NEXT:    v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0
+; GFX10-NEXT:    v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0
+; GFX10-NEXT:    v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0
+; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
+; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX10-NEXT:    v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX10-NEXT:    v_mul_f64 v[14:15], v[16:17], v[8:9]
+; GFX10-NEXT:    v_mul_f64 v[18:19], v[12:13], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
+; GFX10-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13]
+; GFX10-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
+; GFX10-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
+; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fdiv_pow_shl_cnt_vec:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
+; GFX11-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
+; GFX11-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_scale_f64 v[4:5], null, v[0:1], v[0:1], 1.0
+; GFX11-NEXT:    v_div_scale_f64 v[6:7], null, v[2:3], v[2:3], 1.0
+; GFX11-NEXT:    v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
+; GFX11-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX11-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX11-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX11-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX11-NEXT:    v_div_scale_f64 v[12:13], s0, 1.0, v[2:3], 1.0
+; GFX11-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f64 v[14:15], v[16:17], v[8:9]
+; GFX11-NEXT:    v_mul_f64 v[18:19], v[12:13], v[10:11]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
+; GFX11-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
+; GFX11-NEXT:    s_mov_b32 vcc_lo, s0
+; GFX11-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
+; GFX11-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
+  %conv = uitofp <2 x i64> %shl to <2 x double>
+  %mul = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %conv
+  ret <2 x double> %mul
+}
+
+define <2 x float> @fdiv_pow_shl_cnt_vec_with_expensive_cast(<2 x i64> %cnt) nounwind {
+; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
+; CHECK-SSE:       # %bb.0:
+; CHECK-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [1,1]
+; CHECK-SSE-NEXT:    movdqa %xmm3, %xmm2
+; CHECK-SSE-NEXT:    psllq %xmm1, %xmm2
+; CHECK-SSE-NEXT:    psllq %xmm0, %xmm3
+; CHECK-SSE-NEXT:    movq %xmm3, %rax
+; CHECK-SSE-NEXT:    testq %rax, %rax
+; CHECK-SSE-NEXT:    js .LBB15_1
+; CHECK-SSE-NEXT:  # %bb.2:
+; CHECK-SSE-NEXT:    xorps %xmm1, %xmm1
+; CHECK-SSE-NEXT:    cvtsi2ss %rax, %xmm1
+; CHECK-SSE-NEXT:    jmp .LBB15_3
+; CHECK-SSE-NEXT:  .LBB15_1:
+; CHECK-SSE-NEXT:    movq %rax, %rcx
+; CHECK-SSE-NEXT:    shrq %rcx
+; CHECK-SSE-NEXT:    andl $1, %eax
+; CHECK-SSE-NEXT:    orq %rcx, %rax
+; CHECK-SSE-NEXT:    xorps %xmm1, %xmm1
+; CHECK-SSE-NEXT:    cvtsi2ss %rax, %xmm1
+; CHECK-SSE-NEXT:    addss %xmm1, %xmm1
+; CHECK-SSE-NEXT:  .LBB15_3:
+; CHECK-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; CHECK-SSE-NEXT:    movq %xmm0, %rax
+; CHECK-SSE-NEXT:    testq %rax, %rax
+; CHECK-SSE-NEXT:    js .LBB15_4
+; CHECK-SSE-NEXT:  # %bb.5:
+; CHECK-SSE-NEXT:    xorps %xmm0, %xmm0
+; CHECK-SSE-NEXT:    cvtsi2ss %rax, %xmm0
+; CHECK-SSE-NEXT:    jmp .LBB15_6
+; CHECK-SSE-NEXT:  .LBB15_4:
+; CHECK-SSE-NEXT:    movq %rax, %rcx
+; CHECK-SSE-NEXT:    shrq %rcx
+; CHECK-SSE-NEXT:    andl $1, %eax
+; CHECK-SSE-NEXT:    orq %rcx, %rax
+; CHECK-SSE-NEXT:    xorps %xmm0, %xmm0
+; CHECK-SSE-NEXT:    cvtsi2ss %rax, %xmm0
+; CHECK-SSE-NEXT:    addss %xmm0, %xmm0
+; CHECK-SSE-NEXT:  .LBB15_6:
+; CHECK-SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-SSE-NEXT:    movaps {{.*#+}} xmm0 = <1.0E+0,1.0E+0,u,u>
+; CHECK-SSE-NEXT:    divps %xmm1, %xmm0
+; CHECK-SSE-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
+; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [1,1]
+; CHECK-AVX2-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsrlq $1, %xmm0, %xmm2
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; CHECK-AVX2-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpextrq $1, %xmm1, %rax
+; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; CHECK-AVX2-NEXT:    vmovq %xmm1, %rax
+; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
+; CHECK-AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
+; CHECK-AVX2-NEXT:    vaddps %xmm1, %xmm1, %xmm2
+; CHECK-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
+; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; CHECK-AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-AVX2-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
+; CHECK-NO-FASTFMA:       # %bb.0:
+; CHECK-NO-FASTFMA-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [1,1]
+; CHECK-NO-FASTFMA-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vpextrq $1, %xmm0, %rax
+; CHECK-NO-FASTFMA-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm1
+; CHECK-NO-FASTFMA-NEXT:    vmovq %xmm0, %rax
+; CHECK-NO-FASTFMA-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; CHECK-NO-FASTFMA-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-NO-FASTFMA-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    retq
+;
+; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
+; CHECK-FMA:       # %bb.0:
+; CHECK-FMA-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [1,1]
+; CHECK-FMA-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
+; CHECK-FMA-NEXT:    vcvtuqq2ps %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-FMA-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; CHECK-FMA-NEXT:    retq
+; VI-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b64 v[1:2], v2, 1
+; VI-NEXT:    v_ffbh_u32_e32 v3, v2
+; VI-NEXT:    v_min_u32_e32 v5, 32, v3
+; VI-NEXT:    v_lshlrev_b64 v[1:2], v5, v[1:2]
+; VI-NEXT:    v_lshlrev_b64 v[3:4], v0, 1
+; VI-NEXT:    v_min_u32_e32 v0, 1, v1
+; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    v_cvt_f32_u32_e32 v2, v0
+; VI-NEXT:    v_ffbh_u32_e32 v0, v4
+; VI-NEXT:    v_min_u32_e32 v6, 32, v0
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v6, v[3:4]
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, 32, v5
+; VI-NEXT:    v_min_u32_e32 v0, 1, v0
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; VI-NEXT:    v_ldexp_f32 v1, v2, v3
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
+; VI-NEXT:    v_ldexp_f32 v0, v0, v2
+; VI-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; VI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; VI-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; VI-NEXT:    v_div_scale_f32 v5, s[4:5], 1.0, v1, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v6, v2
+; VI-NEXT:    v_rcp_f32_e32 v7, v3
+; VI-NEXT:    v_fma_f32 v8, -v2, v6, 1.0
+; VI-NEXT:    v_fma_f32 v6, v8, v6, v6
+; VI-NEXT:    v_mul_f32_e32 v8, v4, v6
+; VI-NEXT:    v_fma_f32 v10, -v2, v8, v4
+; VI-NEXT:    v_fma_f32 v9, -v3, v7, 1.0
+; VI-NEXT:    v_fma_f32 v8, v10, v6, v8
+; VI-NEXT:    v_fma_f32 v2, -v2, v8, v4
+; VI-NEXT:    v_fma_f32 v4, v9, v7, v7
+; VI-NEXT:    v_div_fmas_f32 v2, v2, v6, v8
+; VI-NEXT:    v_mul_f32_e32 v6, v5, v4
+; VI-NEXT:    v_fma_f32 v7, -v3, v6, v5
+; VI-NEXT:    v_fma_f32 v6, v7, v4, v6
+; VI-NEXT:    v_fma_f32 v3, -v3, v6, v5
+; VI-NEXT:    s_mov_b64 vcc, s[4:5]
+; VI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; VI-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
+; VI-NEXT:    v_div_fixup_f32 v1, v3, v1, 1.0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 1
+; GFX10-NEXT:    v_ffbh_u32_e32 v4, v1
+; GFX10-NEXT:    v_ffbh_u32_e32 v5, v3
+; GFX10-NEXT:    v_min_u32_e32 v4, 32, v4
+; GFX10-NEXT:    v_min_u32_e32 v5, 32, v5
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
+; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 32, v4
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v5
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX10-NEXT:    v_div_scale_f32 v2, s4, v0, v0, 1.0
+; GFX10-NEXT:    v_div_scale_f32 v3, s4, v1, v1, 1.0
+; GFX10-NEXT:    v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0
+; GFX10-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX10-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX10-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
+; GFX10-NEXT:    v_fma_f32 v7, -v3, v5, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v4, v6, v4
+; GFX10-NEXT:    v_div_scale_f32 v6, s4, 1.0, v1, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v5, v7, v5
+; GFX10-NEXT:    v_mul_f32_e32 v7, v8, v4
+; GFX10-NEXT:    v_mul_f32_e32 v9, v6, v5
+; GFX10-NEXT:    v_fma_f32 v10, -v2, v7, v8
+; GFX10-NEXT:    v_fma_f32 v11, -v3, v9, v6
+; GFX10-NEXT:    v_fmac_f32_e32 v7, v10, v4
+; GFX10-NEXT:    v_fmac_f32_e32 v9, v11, v5
+; GFX10-NEXT:    v_fma_f32 v2, -v2, v7, v8
+; GFX10-NEXT:    v_fma_f32 v3, -v3, v9, v6
+; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v4, v7
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
+; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v5, v9
+; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX10-NEXT:    v_div_fixup_f32 v1, v3, v1, 1.0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_clz_i32_u32_e32 v4, v1
+; GFX11-NEXT:    v_clz_i32_u32_e32 v5, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_u32_e32 v4, 32, v4
+; GFX11-NEXT:    v_min_u32_e32 v5, 32, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 32, v4
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_scale_f32 v2, null, v0, v0, 1.0
+; GFX11-NEXT:    v_div_scale_f32 v3, null, v1, v1, 1.0
+; GFX11-NEXT:    v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX11-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
+; GFX11-NEXT:    v_fma_f32 v7, -v3, v5, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_fmac_f32 v4, v6, v4 :: v_dual_fmac_f32 v5, v7, v5
+; GFX11-NEXT:    v_div_scale_f32 v6, s0, 1.0, v1, 1.0
+; GFX11-NEXT:    v_mul_f32_e32 v7, v8, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f32_e32 v9, v6, v5
+; GFX11-NEXT:    v_fma_f32 v10, -v2, v7, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_f32 v11, -v3, v9, v6
+; GFX11-NEXT:    v_fmac_f32_e32 v7, v10, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fmac_f32_e32 v9, v11, v5
+; GFX11-NEXT:    v_fma_f32 v2, -v2, v7, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_f32 v3, -v3, v9, v6
+; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v4, v7
+; GFX11-NEXT:    s_mov_b32 vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fmas_f32 v3, v3, v5, v9
+; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v1, v3, v1, 1.0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
+  %conv = uitofp <2 x i64> %shl to <2 x float>
+  %mul = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %conv
+  ret <2 x float> %mul
+}
+
+; fdiv of -9.0 by uitofp(shl i64 8, %cnt). The shl carries no nuw/nsw flag, so
+; the shifted value wraps to zero for %cnt >= 61 (presumably the "maybe_z" in
+; the name). The assertions below pin the full f32 division expansion
+; (v_div_scale / v_rcp / v_div_fmas / v_div_fixup), i.e. no exponent fold.
+define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
+; VI-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
+; VI-NEXT:    s_mov_b32 s6, 0xc1100000
+; VI-NEXT:    v_ffbh_u32_e32 v2, v1
+; VI-NEXT:    v_min_u32_e32 v2, 32, v2
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; VI-NEXT:    v_min_u32_e32 v0, 1, v0
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
+; VI-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
+; VI-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
+; VI-NEXT:    v_rcp_f32_e32 v3, v1
+; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
+; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
+; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
+; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
+; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
+; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
+; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
+; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX10-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 0xc1100000
+; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX10-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
+; GFX10-NEXT:    v_mul_f32_e32 v4, v3, v2
+; GFX10-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; GFX10-NEXT:    v_fmac_f32_e32 v4, v5, v2
+; GFX10-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; GFX10-NEXT:    v_div_fixup_f32 v0, v1, v0, 0xc1100000
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v1
+; GFX11-NEXT:    v_min_u32_e32 v2, 32, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-NEXT:    v_div_scale_f32 v1, null, v0, v0, 0xc1100000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
+; GFX11-NEXT:    v_fmac_f32_e32 v2, v3, v2
+; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v4, v3, v2
+; GFX11-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v4, v5, v2
+; GFX11-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 0xc1100000
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl i64 8, %cnt
+  %conv = uitofp i64 %shl to float
+  %mul = fdiv float -9.000000e+00, %conv
+  ret float %mul
+}
+
+; fdiv of -9.0 by sitofp(shl i64 8, %cnt). With no nsw flag the shifted value
+; can have its sign bit set (%cnt == 60), so the signed conversion may yield a
+; negative divisor (presumably the "neg_int" in the name). The assertions
+; below pin the full f32 division expansion (v_div_scale / v_rcp / v_div_fmas /
+; v_div_fixup), i.e. no exponent fold.
+define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
+; VI-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
+; VI-NEXT:    s_mov_b32 s6, 0xc1100000
+; VI-NEXT:    v_xor_b32_e32 v2, v0, v1
+; VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; VI-NEXT:    v_ffbh_i32_e32 v3, v1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -1, v3
+; VI-NEXT:    v_min_u32_e32 v2, v3, v2
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; VI-NEXT:    v_min_u32_e32 v0, 1, v0
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
+; VI-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
+; VI-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
+; VI-NEXT:    v_rcp_f32_e32 v3, v1
+; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
+; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
+; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
+; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
+; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
+; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
+; GFX10-NEXT:    v_xor_b32_e32 v2, v0, v1
+; GFX10-NEXT:    v_ffbh_i32_e32 v3, v1
+; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, -1, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, 32, v2
+; GFX10-NEXT:    v_min_u32_e32 v2, v3, v2
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX10-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 0xc1100000
+; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX10-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
+; GFX10-NEXT:    v_mul_f32_e32 v4, v3, v2
+; GFX10-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; GFX10-NEXT:    v_fmac_f32_e32 v4, v5, v2
+; GFX10-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; GFX10-NEXT:    v_div_fixup_f32 v0, v1, v0, 0xc1100000
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_xor_b32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cls_i32_e32 v3, v1
+; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, -1, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 32, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_min_u32_e32 v2, v3, v2
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_div_scale_f32 v1, null, v0, v0, 0xc1100000
+; GFX11-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v2, v3, v2
+; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
+; GFX11-NEXT:    v_mul_f32_e32 v4, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; GFX11-NEXT:    v_fmac_f32_e32 v4, v5, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 0xc1100000
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl i64 8, %cnt
+  %conv = sitofp i64 %shl to float
+  %mul = fdiv float -9.000000e+00, %conv
+  ret float %mul
+}
+
+; fdiv of -0.5 by sitofp(shl i64 8, %cnt) where %cnt is %cnt_in masked to
+; [0, 31], so the divisor is a positive power of two no larger than 2^34.
+; The recorded output still contains the full f32 division expansion
+; (v_div_scale / v_rcp / v_div_fmas / v_div_fixup).
+define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
+; VI-LABEL: fdiv_pow_shl_cnt:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v0, 31, v0
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
+; VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
+; VI-NEXT:    v_ffbh_i32_e32 v3, v1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -1, v3
+; VI-NEXT:    v_min_u32_e32 v2, v3, v2
+; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; VI-NEXT:    v_min_u32_e32 v0, 1, v0
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
+; VI-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, -0.5
+; VI-NEXT:    v_div_scale_f32 v2, vcc, -0.5, v0, -0.5
+; VI-NEXT:    v_rcp_f32_e32 v3, v1
+; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
+; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
+; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
+; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
+; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
+; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, -0.5
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fdiv_pow_shl_cnt:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 31, v0
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
+; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
+; GFX10-NEXT:    v_ffbh_i32_e32 v3, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, 32, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, -1, v3
+; GFX10-NEXT:    v_min_u32_e32 v2, v3, v2
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX10-NEXT:    v_div_scale_f32 v1, s4, v0, v0, -0.5
+; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX10-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, -0.5, v0, -0.5
+; GFX10-NEXT:    v_mul_f32_e32 v4, v3, v2
+; GFX10-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; GFX10-NEXT:    v_fmac_f32_e32 v4, v5, v2
+; GFX10-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; GFX10-NEXT:    v_div_fixup_f32 v0, v1, v0, -0.5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fdiv_pow_shl_cnt:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 31, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
+; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cls_i32_e32 v3, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 32, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, -1, v3
+; GFX11-NEXT:    v_min_u32_e32 v2, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-NEXT:    v_div_scale_f32 v1, null, v0, v0, -0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
+; GFX11-NEXT:    v_fmac_f32_e32 v2, v3, v2
+; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, -0.5, v0, -0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v4, v3, v2
+; GFX11-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v4, v5, v2
+; GFX11-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, -0.5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %cnt = and i64 %cnt_in, 31
+  %shl = shl i64 8, %cnt
+  %conv = sitofp i64 %shl to float
+  %mul = fdiv float -0.500000e+00, %conv
+  ret float %mul
+}
+
+; fdiv of half 0xH7000 (8192.0) by uitofp(shl nuw i32 1, %cnt) to half. The
+; i32 shift can produce powers of two up to 2^31, beyond the largest finite
+; half value (65504); presumably that is the "out_of_bounds" in the name.
+; The assertions pin the f16 division sequence (convert to f32, v_rcp_f32,
+; scale by 8192.0, v_div_fixup_f16).
+define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
+; VI-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; VI-NEXT:    s_movk_i32 s4, 0x7000
+; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; VI-NEXT:    v_rcp_f32_e32 v1, v1
+; VI-NEXT:    v_mul_f32_e32 v1, 0x46000000, v1
+; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; VI-NEXT:    v_div_fixup_f16 v0, v1, v0, s4
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; GFX10-NEXT:    s_mov_b32 s4, 0x46000000
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX10-NEXT:    v_fma_mixlo_f16 v1, v1, s4, 0
+; GFX10-NEXT:    v_div_fixup_f16 v0, v1, v0, 0x7000
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; GFX11-NEXT:    s_mov_b32 s0, 0x46000000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_fma_mixlo_f16 v1, v1, s0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_div_fixup_f16 v0, v1, v0, 0x7000
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nuw i32 1, %cnt
+  %conv = uitofp i32 %shl to half
+  %mul = fdiv half 0xH7000, %conv
+  ret half %mul
+}
+
+; fdiv of half 0xH7000 (8192.0) by uitofp(shl nuw i16 1, %cnt) to half. The
+; i16 shift result is at most 2^15, which is exactly representable in half.
+; The recorded output still contains the f16 division sequence (v_rcp_f32,
+; scale by 8192.0, v_div_fixup_f16).
+define half @fdiv_pow_shl_cnt_in_bounds(i16 %cnt) nounwind {
+; VI-LABEL: fdiv_pow_shl_cnt_in_bounds:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e64 v0, v0, 1
+; VI-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; VI-NEXT:    s_movk_i32 s4, 0x7000
+; VI-NEXT:    v_rcp_f32_e32 v1, v1
+; VI-NEXT:    v_mul_f32_e32 v1, 0x46000000, v1
+; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; VI-NEXT:    v_div_fixup_f16 v0, v1, v0, s4
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fdiv_pow_shl_cnt_in_bounds:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b16 v0, v0, 1
+; GFX10-NEXT:    s_mov_b32 s4, 0x46000000
+; GFX10-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX10-NEXT:    v_fma_mixlo_f16 v1, v1, s4, 0
+; GFX10-NEXT:    v_div_fixup_f16 v0, v1, v0, 0x7000
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b16 v0, v0, 1
+; GFX11-NEXT:    s_mov_b32 s0, 0x46000000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_fma_mixlo_f16 v1, v1, s0, 0
+; GFX11-NEXT:    v_div_fixup_f16 v0, v1, v0, 0x7000
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nuw i16 1, %cnt
+  %conv = uitofp i16 %shl to half
+  %mul = fdiv half 0xH7000, %conv
+  ret half %mul
+}
+
+define half @fdiv_pow_shl_cnt_in_bounds2(i16 %cnt) nounwind { ; returns half 8.0 / (half)(1 << %cnt), with the shift done in i16
+; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_in_bounds2:
+; CHECK-SSE:       # %bb.0:
+; CHECK-SSE-NEXT:    pushq %rax
+; CHECK-SSE-NEXT:    movl %edi, %ecx
+; CHECK-SSE-NEXT:    movl $1, %eax
+; CHECK-SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-SSE-NEXT:    shll %cl, %eax
+; CHECK-SSE-NEXT:    movzwl %ax, %eax
+; CHECK-SSE-NEXT:    cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT:    callq __truncsfhf2 at PLT
+; CHECK-SSE-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-SSE-NEXT:    divss %xmm0, %xmm1
+; CHECK-SSE-NEXT:    movaps %xmm1, %xmm0
+; CHECK-SSE-NEXT:    callq __truncsfhf2 at PLT
+; CHECK-SSE-NEXT:    popq %rax
+; CHECK-SSE-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_in_bounds2:
+; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    pushq %rax
+; CHECK-AVX2-NEXT:    movl %edi, %ecx
+; CHECK-AVX2-NEXT:    movl $1, %eax
+; CHECK-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-AVX2-NEXT:    shll %cl, %eax
+; CHECK-AVX2-NEXT:    movzwl %ax, %eax
+; CHECK-AVX2-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    callq __truncsfhf2 at PLT
+; CHECK-AVX2-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-AVX2-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    callq __truncsfhf2 at PLT
+; CHECK-AVX2-NEXT:    popq %rax
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_in_bounds2:
+; CHECK-NO-FASTFMA:       # %bb.0:
+; CHECK-NO-FASTFMA-NEXT:    movl %edi, %ecx
+; CHECK-NO-FASTFMA-NEXT:    movl $1, %eax
+; CHECK-NO-FASTFMA-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NO-FASTFMA-NEXT:    shll %cl, %eax
+; CHECK-NO-FASTFMA-NEXT:    movzwl %ax, %eax
+; CHECK-NO-FASTFMA-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; CHECK-NO-FASTFMA-NEXT:    vcvtph2ps %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NO-FASTFMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vmovd %xmm0, %eax
+; CHECK-NO-FASTFMA-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    retq
+;
+; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_in_bounds2:
+; CHECK-FMA:       # %bb.0:
+; CHECK-FMA-NEXT:    movl $1, %eax
+; CHECK-FMA-NEXT:    shlxl %edi, %eax, %eax
+; CHECK-FMA-NEXT:    movzwl %ax, %eax
+; CHECK-FMA-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; CHECK-FMA-NEXT:    vcvtph2ps %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; CHECK-FMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vmovd %xmm0, %eax
+; CHECK-FMA-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    retq
+; VI-LABEL: fdiv_pow_shl_cnt_in_bounds2:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e64 v0, v0, 1
+; VI-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; VI-NEXT:    s_movk_i32 s4, 0x4800
+; VI-NEXT:    v_rcp_f32_e32 v1, v1
+; VI-NEXT:    v_mul_f32_e32 v1, 0x41000000, v1
+; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; VI-NEXT:    v_div_fixup_f16 v0, v1, v0, s4
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fdiv_pow_shl_cnt_in_bounds2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b16 v0, v0, 1
+; GFX10-NEXT:    s_mov_b32 s4, 0x41000000
+; GFX10-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX10-NEXT:    v_fma_mixlo_f16 v1, v1, s4, 0
+; GFX10-NEXT:    v_div_fixup_f16 v0, v1, v0, 0x4800
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b16 v0, v0, 1
+; GFX11-NEXT:    s_mov_b32 s0, 0x41000000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_fma_mixlo_f16 v1, v1, s0, 0
+; GFX11-NEXT:    v_div_fixup_f16 v0, v1, v0, 0x4800
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nuw i16 1, %cnt ; 1 << %cnt in i16; nuw makes the result poison if a set bit is shifted out
+  %conv = uitofp i16 %shl to half ; unsigned i16 -> half
+  %mul = fdiv half 0xH4800, %conv ; 0xH4800 is half 8.0 (biased exponent 18; f32 0x41000000 in the checks); a quotient despite the name %mul. Presumably large enough for the divide-by-pow2 fold, hence "in_bounds" - TODO confirm
+  ret half %mul ; NOTE(review): the CHECK-SSE/-AVX2/-NO-FASTFMA/-FMA blocks above use prefixes that no RUN line at the top of this file enables (only VI, GFX10, GFX11) - likely stale x86 checks; confirm and drop when regenerating
+}
+
+define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ; returns half 2.0 / (half)(1 << %cnt), with the shift done in i16
+; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
+; CHECK-SSE:       # %bb.0:
+; CHECK-SSE-NEXT:    pushq %rax
+; CHECK-SSE-NEXT:    movl %edi, %ecx
+; CHECK-SSE-NEXT:    movl $1, %eax
+; CHECK-SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-SSE-NEXT:    shll %cl, %eax
+; CHECK-SSE-NEXT:    movzwl %ax, %eax
+; CHECK-SSE-NEXT:    cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT:    callq __truncsfhf2 at PLT
+; CHECK-SSE-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-SSE-NEXT:    divss %xmm0, %xmm1
+; CHECK-SSE-NEXT:    movaps %xmm1, %xmm0
+; CHECK-SSE-NEXT:    callq __truncsfhf2 at PLT
+; CHECK-SSE-NEXT:    popq %rax
+; CHECK-SSE-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
+; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    pushq %rax
+; CHECK-AVX2-NEXT:    movl %edi, %ecx
+; CHECK-AVX2-NEXT:    movl $1, %eax
+; CHECK-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-AVX2-NEXT:    shll %cl, %eax
+; CHECK-AVX2-NEXT:    movzwl %ax, %eax
+; CHECK-AVX2-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    callq __truncsfhf2 at PLT
+; CHECK-AVX2-NEXT:    callq __extendhfsf2 at PLT
+; CHECK-AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-AVX2-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    callq __truncsfhf2 at PLT
+; CHECK-AVX2-NEXT:    popq %rax
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
+; CHECK-NO-FASTFMA:       # %bb.0:
+; CHECK-NO-FASTFMA-NEXT:    movl %edi, %ecx
+; CHECK-NO-FASTFMA-NEXT:    movl $1, %eax
+; CHECK-NO-FASTFMA-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NO-FASTFMA-NEXT:    shll %cl, %eax
+; CHECK-NO-FASTFMA-NEXT:    movzwl %ax, %eax
+; CHECK-NO-FASTFMA-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; CHECK-NO-FASTFMA-NEXT:    vcvtph2ps %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NO-FASTFMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vmovd %xmm0, %eax
+; CHECK-NO-FASTFMA-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    retq
+;
+; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
+; CHECK-FMA:       # %bb.0:
+; CHECK-FMA-NEXT:    movl $1, %eax
+; CHECK-FMA-NEXT:    shlxl %edi, %eax, %eax
+; CHECK-FMA-NEXT:    movzwl %ax, %eax
+; CHECK-FMA-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; CHECK-FMA-NEXT:    vcvtph2ps %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; CHECK-FMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vmovd %xmm0, %eax
+; CHECK-FMA-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    retq
+; VI-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e64 v0, v0, 1
+; VI-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; VI-NEXT:    v_rcp_f32_e32 v1, v1
+; VI-NEXT:    v_add_f32_e32 v1, v1, v1
+; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; VI-NEXT:    v_div_fixup_f16 v0, v1, v0, 2.0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b16 v0, v0, 1
+; GFX10-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT:    v_div_fixup_f16 v0, v1, v0, 2.0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b16 v0, v0, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v1
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_div_fixup_f16 v0, v1, v0, 2.0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nuw i16 1, %cnt ; 1 << %cnt in i16; nuw makes the result poison if a set bit is shifted out
+  %conv = uitofp i16 %shl to half ; unsigned i16 -> half
+  %mul = fdiv half 0xH4000, %conv ; 0xH4000 is half 2.0 (biased exponent 16, printed as 2.0 in the checks); a quotient despite the name %mul. Presumably too small a dividend for the divide-by-pow2 fold, hence "fail_out_of_bound" - TODO confirm
+  ret half %mul ; NOTE(review): the CHECK-SSE/-AVX2/-NO-FASTFMA/-FMA blocks above use prefixes that no RUN line at the top of this file enables (only VI, GFX10, GFX11) - likely stale x86 checks; confirm and drop when regenerating
+}
+
+define double @fdiv_pow_shl_cnt32_to_dbl_okay(i32 %cnt) nounwind { ; returns double 2^-149 / (double)(1 << %cnt), with the shift done in i32
+; CHECK-SSE-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
+; CHECK-SSE:       # %bb.0:
+; CHECK-SSE-NEXT:    movl %edi, %ecx
+; CHECK-SSE-NEXT:    movl $1, %eax
+; CHECK-SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-SSE-NEXT:    shll %cl, %eax
+; CHECK-SSE-NEXT:    cvtsi2sd %rax, %xmm1
+; CHECK-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-NEXT:    divsd %xmm1, %xmm0
+; CHECK-SSE-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
+; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    movl %edi, %ecx
+; CHECK-AVX2-NEXT:    movl $1, %eax
+; CHECK-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-AVX2-NEXT:    shll %cl, %eax
+; CHECK-AVX2-NEXT:    vcvtsi2sd %rax, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-AVX2-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
+; CHECK-NO-FASTFMA:       # %bb.0:
+; CHECK-NO-FASTFMA-NEXT:    movl %edi, %ecx
+; CHECK-NO-FASTFMA-NEXT:    movl $1, %eax
+; CHECK-NO-FASTFMA-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NO-FASTFMA-NEXT:    shll %cl, %eax
+; CHECK-NO-FASTFMA-NEXT:    vcvtusi2sd %eax, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NO-FASTFMA-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    retq
+;
+; CHECK-FMA-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
+; CHECK-FMA:       # %bb.0:
+; CHECK-FMA-NEXT:    movl $1, %eax
+; CHECK-FMA-NEXT:    shlxl %edi, %eax, %eax
+; CHECK-FMA-NEXT:    vcvtusi2sd %eax, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
+; CHECK-FMA-NEXT:    retq
+; VI-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    s_mov_b32 s5, 0x36a00000
+; VI-NEXT:    v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-NEXT:    v_div_scale_f64 v[6:7], vcc, s[4:5], v[0:1], s[4:5]
+; VI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-NEXT:    v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[4:5]
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b32 s5, 0x36a00000
+; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX10-NEXT:    v_div_scale_f64 v[2:3], s6, v[0:1], v[0:1], s[4:5]
+; GFX10-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, s[4:5], v[0:1], s[4:5]
+; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX10-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX10-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX10-NEXT:    v_mul_f64 v[6:7], v[8:9], v[4:5]
+; GFX10-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
+; GFX10-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s1, 0x36a00000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-NEXT:    v_div_scale_f64 v[2:3], null, v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, s[0:1], v[0:1], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX11-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX11-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[6:7], v[8:9], v[4:5]
+; GFX11-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; GFX11-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nuw i32 1, %cnt ; 1 << %cnt in i32; nuw makes the result poison if a set bit is shifted out
+  %conv = uitofp i32 %shl to double ; unsigned i32 -> double
+  %mul = fdiv double 0x36A0000000000000, %conv ; 0x36A0000000000000 is 2^-149 (biased exponent 874; high word 0x36a00000 in the checks); a quotient despite the name %mul. Presumably far enough above the double underflow range for the fold, hence "okay" - TODO confirm
+  ret double %mul ; NOTE(review): the CHECK-SSE/-AVX2/-NO-FASTFMA/-FMA blocks above use prefixes that no RUN line at the top of this file enables (only VI, GFX10, GFX11) - likely stale x86 checks; confirm and drop when regenerating
+}
+
+define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind { ; returns float C / (float)(1 << %cnt) with C just below 2^-93, shift done in i32
+; CHECK-SSE-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
+; CHECK-SSE:       # %bb.0:
+; CHECK-SSE-NEXT:    movl %edi, %ecx
+; CHECK-SSE-NEXT:    movl $1, %eax
+; CHECK-SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-SSE-NEXT:    shll %cl, %eax
+; CHECK-SSE-NEXT:    cvtsi2ss %rax, %xmm1
+; CHECK-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-NEXT:    divss %xmm1, %xmm0
+; CHECK-SSE-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
+; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    movl %edi, %ecx
+; CHECK-AVX2-NEXT:    movl $1, %eax
+; CHECK-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-AVX2-NEXT:    shll %cl, %eax
+; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-AVX2-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
+; CHECK-NO-FASTFMA:       # %bb.0:
+; CHECK-NO-FASTFMA-NEXT:    movl %edi, %ecx
+; CHECK-NO-FASTFMA-NEXT:    movl $1, %eax
+; CHECK-NO-FASTFMA-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NO-FASTFMA-NEXT:    shll %cl, %eax
+; CHECK-NO-FASTFMA-NEXT:    vcvtusi2ss %eax, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NO-FASTFMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    retq
+;
+; CHECK-FMA-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
+; CHECK-FMA:       # %bb.0:
+; CHECK-FMA-NEXT:    movl $1, %eax
+; CHECK-FMA-NEXT:    shlxl %edi, %eax, %eax
+; CHECK-FMA-NEXT:    vcvtusi2ss %eax, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; CHECK-FMA-NEXT:    retq
+; VI-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; VI-NEXT:    s_mov_b32 s6, 0x10fffff8
+; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
+; VI-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
+; VI-NEXT:    v_rcp_f32_e32 v3, v1
+; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
+; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
+; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
+; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
+; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
+; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 0x10fffff8
+; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX10-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, 0x10fffff8, v0, 0x10fffff8
+; GFX10-NEXT:    v_mul_f32_e32 v4, v3, v2
+; GFX10-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; GFX10-NEXT:    v_fmac_f32_e32 v4, v5, v2
+; GFX10-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; GFX10-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x10fffff8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_div_scale_f32 v1, null, v0, v0, 0x10fffff8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
+; GFX11-NEXT:    v_fmac_f32_e32 v2, v3, v2
+; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, 0x10fffff8, v0, 0x10fffff8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v4, v3, v2
+; GFX11-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v4, v5, v2
+; GFX11-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x10fffff8
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nuw i32 1, %cnt ; 1 << %cnt in i32; nuw makes the result poison if a set bit is shifted out
+  %conv = uitofp i32 %shl to float ; unsigned i32 -> float
+  %mul = fdiv float 0x3a1fffff00000000, %conv ; float constant in LLVM's double-hex notation; f32 bits 0x10fffff8 in the checks (biased exponent 33), one exponent step below the 2^-93 dividend of @fdiv_pow_shl_cnt32_okay. Presumably just under the fold's limit, hence "out_of_bounds" - TODO confirm
+  ret float %mul ; NOTE(review): the CHECK-SSE/-AVX2/-NO-FASTFMA/-FMA blocks above use prefixes that no RUN line at the top of this file enables (only VI, GFX10, GFX11) - likely stale x86 checks; confirm and drop when regenerating
+}
+
+define float @fdiv_pow_shl_cnt32_okay(i32 %cnt) nounwind { ; returns float 2^-93 / (float)(1 << %cnt), with the shift done in i32
+; CHECK-SSE-LABEL: fdiv_pow_shl_cnt32_okay:
+; CHECK-SSE:       # %bb.0:
+; CHECK-SSE-NEXT:    movl %edi, %ecx
+; CHECK-SSE-NEXT:    movl $1, %eax
+; CHECK-SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-SSE-NEXT:    shll %cl, %eax
+; CHECK-SSE-NEXT:    cvtsi2ss %rax, %xmm1
+; CHECK-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-NEXT:    divss %xmm1, %xmm0
+; CHECK-SSE-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt32_okay:
+; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    movl %edi, %ecx
+; CHECK-AVX2-NEXT:    movl $1, %eax
+; CHECK-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-AVX2-NEXT:    shll %cl, %eax
+; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-AVX2-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt32_okay:
+; CHECK-NO-FASTFMA:       # %bb.0:
+; CHECK-NO-FASTFMA-NEXT:    movl %edi, %ecx
+; CHECK-NO-FASTFMA-NEXT:    movl $1, %eax
+; CHECK-NO-FASTFMA-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NO-FASTFMA-NEXT:    shll %cl, %eax
+; CHECK-NO-FASTFMA-NEXT:    vcvtusi2ss %eax, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NO-FASTFMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    retq
+;
+; CHECK-FMA-LABEL: fdiv_pow_shl_cnt32_okay:
+; CHECK-FMA:       # %bb.0:
+; CHECK-FMA-NEXT:    movl $1, %eax
+; CHECK-FMA-NEXT:    shlxl %edi, %eax, %eax
+; CHECK-FMA-NEXT:    vcvtusi2ss %eax, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; CHECK-FMA-NEXT:    retq
+; VI-LABEL: fdiv_pow_shl_cnt32_okay:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; VI-NEXT:    s_mov_b32 s6, 0x11000000
+; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
+; VI-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
+; VI-NEXT:    v_rcp_f32_e32 v3, v1
+; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
+; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
+; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
+; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
+; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
+; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fdiv_pow_shl_cnt32_okay:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 0x11000000
+; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX10-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, 0x11000000, v0, 0x11000000
+; GFX10-NEXT:    v_mul_f32_e32 v4, v3, v2
+; GFX10-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; GFX10-NEXT:    v_fmac_f32_e32 v4, v5, v2
+; GFX10-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; GFX10-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x11000000
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fdiv_pow_shl_cnt32_okay:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_div_scale_f32 v1, null, v0, v0, 0x11000000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
+; GFX11-NEXT:    v_fmac_f32_e32 v2, v3, v2
+; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, 0x11000000, v0, 0x11000000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v4, v3, v2
+; GFX11-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v4, v5, v2
+; GFX11-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x11000000
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %shl = shl nuw i32 1, %cnt ; 1 << %cnt in i32; nuw makes the result poison if a set bit is shifted out
+  %conv = uitofp i32 %shl to float ; unsigned i32 -> float
+  %mul = fdiv float 0x3a20000000000000, %conv ; float constant in LLVM's double-hex notation; f32 bits 0x11000000 in the checks, i.e. 2^-93 (biased exponent 34); a quotient despite the name %mul. Presumably the smallest dividend exponent the fold accepts for i32 shifts, hence "okay" - TODO confirm
+  ret float %mul ; NOTE(review): the CHECK-SSE/-AVX2/-NO-FASTFMA/-FMA blocks above use prefixes that no RUN line at the top of this file enables (only VI, GFX10, GFX11) - likely stale x86 checks; confirm and drop when regenerating
+}


        


More information about the llvm-commits mailing list