[llvm] [AMDGPU] Add all types for bitcast on VReg_512 (PR #131775)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 19 19:48:48 PDT 2025
https://github.com/Shoreshen updated https://github.com/llvm/llvm-project/pull/131775
>From 96502df19db2b436c58349bd96bce3a2c8c2671d Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Tue, 18 Mar 2025 18:10:20 +0800
Subject: [PATCH 1/5] Add all types for bitcast
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 45 ++------
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll | 127 +++++++++++++++++++++
2 files changed, 135 insertions(+), 37 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index de77401eb0137..2eee87068a3b8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1841,58 +1841,29 @@ def : BitConvert <v12i32, v12f32, VReg_384>;
def : BitConvert <v12f32, v12i32, VReg_384>;
// 512-bit bitcast
-def : BitConvert <v32f16, v32i16, VReg_512>;
-def : BitConvert <v32i16, v32f16, VReg_512>;
-def : BitConvert <v32f16, v16i32, VReg_512>;
-def : BitConvert <v32f16, v16f32, VReg_512>;
-def : BitConvert <v16f32, v32f16, VReg_512>;
-def : BitConvert <v16i32, v32f16, VReg_512>;
-def : BitConvert <v32i16, v16i32, VReg_512>;
-def : BitConvert <v32i16, v16f32, VReg_512>;
-def : BitConvert <v16f32, v32i16, VReg_512>;
-def : BitConvert <v16i32, v32i16, VReg_512>;
-def : BitConvert <v16i32, v16f32, VReg_512>;
-def : BitConvert <v16f32, v16i32, VReg_512>;
-def : BitConvert <v8i64, v8f64, VReg_512>;
-def : BitConvert <v8f64, v8i64, VReg_512>;
-def : BitConvert <v8i64, v16i32, VReg_512>;
-def : BitConvert <v8f64, v16i32, VReg_512>;
-def : BitConvert <v16i32, v8i64, VReg_512>;
-def : BitConvert <v16i32, v8f64, VReg_512>;
-def : BitConvert <v8i64, v16f32, VReg_512>;
-def : BitConvert <v8f64, v16f32, VReg_512>;
-def : BitConvert <v16f32, v8i64, VReg_512>;
-def : BitConvert <v16f32, v8f64, VReg_512>;
-
-
-
-def : BitConvert <v32bf16, v32i16, VReg_512>;
-def : BitConvert <v32i16, v32bf16, VReg_512>;
+foreach vt = VReg_512.RegTypes in {
+ foreach st = VReg_512.RegTypes in {
+ if !not(!eq (vt, st)) then {
+ def : BitConvert <vt, st, VReg_512>;
+ }
+ }
+}
+
def : BitConvert <v32bf16, v32i16, SReg_512>;
def : BitConvert <v32i16, v32bf16, SReg_512>;
-def : BitConvert <v32bf16, v32f16, VReg_512>;
-def : BitConvert <v32f16, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v32f16, SReg_512>;
def : BitConvert <v32f16, v32bf16, SReg_512>;
-def : BitConvert <v32bf16, v16i32, VReg_512>;
-def : BitConvert <v16i32, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v16i32, SReg_512>;
def : BitConvert <v16i32, v32bf16, SReg_512>;
-def : BitConvert <v32bf16, v16f32, VReg_512>;
-def : BitConvert <v16f32, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v16f32, SReg_512>;
def : BitConvert <v16f32, v32bf16, SReg_512>;
-def : BitConvert <v32bf16, v8f64, VReg_512>;
-def : BitConvert <v8f64, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v8f64, SReg_512>;
def : BitConvert <v8f64, v32bf16, SReg_512>;
-def : BitConvert <v32bf16, v8i64, VReg_512>;
-def : BitConvert <v8i64, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v8i64, SReg_512>;
def : BitConvert <v8i64, v32bf16, SReg_512>;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
index 5065f57c67dfd..b36ade582c878 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
@@ -1946,6 +1946,133 @@ end:
ret void
}
+; CHECK-LABEL: {{^}}v_bitcast_v8i64_to_v32f16:
+define <32 x half> @v_bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+cmp.true:
+ %a1 = add <8 x i64> %a, splat (i64 3)
+ %a2 = bitcast <8 x i64> %a1 to <32 x half>
+ br label %end
+cmp.false:
+ %a3 = bitcast <8 x i64> %a to <32 x half>
+ br label %end
+end:
+ %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x half> %phi
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v8i64_to_v32i16:
+define <32 x i16> @v_bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+cmp.true:
+ %a1 = add <8 x i64> %a, splat (i64 3)
+ %a2 = bitcast <8 x i64> %a1 to <32 x i16>
+ br label %end
+cmp.false:
+ %a3 = bitcast <8 x i64> %a to <32 x i16>
+ br label %end
+end:
+ %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x i16> %phi
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v8f64_to_v32i16:
+define <32 x i16> @v_bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+cmp.true:
+ %a1 = fadd <8 x double> %a, splat (double 1.000000e+00)
+ %a2 = bitcast <8 x double> %a1 to <32 x i16>
+ br label %end
+cmp.false:
+ %a3 = bitcast <8 x double> %a to <32 x i16>
+ br label %end
+end:
+ %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x i16> %phi
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v8f64_to_v32f16:
+define <32 x half> @v_bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+cmp.true:
+ %a1 = fadd <8 x double> %a, splat (double 1.000000e+00)
+ %a2 = bitcast <8 x double> %a1 to <32 x half>
+ br label %end
+cmp.false:
+ %a3 = bitcast <8 x double> %a to <32 x half>
+ br label %end
+end:
+ %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x half> %phi
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v32f16_to_v8i64:
+define <8 x i64> @v_bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+cmp.true:
+ %a1 = fadd <32 x half> %a, splat (half 0xH0200)
+ %a2 = bitcast <32 x half> %a1 to <8 x i64>
+ br label %end
+cmp.false:
+ %a3 = bitcast <32 x half> %a to <8 x i64>
+ br label %end
+end:
+ %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <8 x i64> %phi
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v32f16_to_v8f64:
+define <8 x double> @v_bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+cmp.true:
+ %a1 = fadd <32 x half> %a, splat (half 0xH0200)
+ %a2 = bitcast <32 x half> %a1 to <8 x double>
+ br label %end
+cmp.false:
+ %a3 = bitcast <32 x half> %a to <8 x double>
+ br label %end
+end:
+ %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <8 x double> %phi
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v32i16_to_v8i64:
+define <8 x i64> @v_bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+cmp.true:
+ %a1 = add <32 x i16> %a, splat (i16 3)
+ %a2 = bitcast <32 x i16> %a1 to <8 x i64>
+ br label %end
+cmp.false:
+ %a3 = bitcast <32 x i16> %a to <8 x i64>
+ br label %end
+end:
+ %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <8 x i64> %phi
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v32i16_to_v8f64:
+define <8 x double> @v_bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+cmp.true:
+ %a1 = add <32 x i16> %a, splat (i16 3)
+ %a2 = bitcast <32 x i16> %a1 to <8 x double>
+ br label %end
+cmp.false:
+ %a3 = bitcast <32 x i16> %a to <8 x double>
+ br label %end
+end:
+ %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <8 x double> %phi
+}
>From 870854892dfe8508aa3b3278f7dc0c0f8d396ede Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Tue, 18 Mar 2025 20:57:02 +0800
Subject: [PATCH 2/5] fix comment
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 46 +++++++--
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll | 114 ---------------------
llvm/test/lit.cfg.py | 2 +-
3 files changed, 39 insertions(+), 123 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2eee87068a3b8..09d04461da514 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1841,29 +1841,59 @@ def : BitConvert <v12i32, v12f32, VReg_384>;
def : BitConvert <v12f32, v12i32, VReg_384>;
// 512-bit bitcast
-foreach vt = VReg_512.RegTypes in {
- foreach st = VReg_512.RegTypes in {
- if !not(!eq (vt, st)) then {
- def : BitConvert <vt, st, VReg_512>;
- }
- }
-}
-
+// 512-bit bitcast
+def : BitConvert <v32f16, v32i16, VReg_512>;
+def : BitConvert <v32i16, v32f16, VReg_512>;
+def : BitConvert <v32f16, v16i32, VReg_512>;
+def : BitConvert <v32f16, v16f32, VReg_512>;
+def : BitConvert <v16f32, v32f16, VReg_512>;
+def : BitConvert <v16i32, v32f16, VReg_512>;
+def : BitConvert <v32i16, v16i32, VReg_512>;
+def : BitConvert <v32i16, v16f32, VReg_512>;
+def : BitConvert <v16f32, v32i16, VReg_512>;
+def : BitConvert <v16i32, v32i16, VReg_512>;
+def : BitConvert <v16i32, v16f32, VReg_512>;
+def : BitConvert <v16f32, v16i32, VReg_512>;
+def : BitConvert <v8i64, v8f64, VReg_512>;
+def : BitConvert <v8f64, v8i64, VReg_512>;
+def : BitConvert <v8i64, v16i32, VReg_512>;
+def : BitConvert <v8f64, v16i32, VReg_512>;
+def : BitConvert <v16i32, v8i64, VReg_512>;
+def : BitConvert <v16i32, v8f64, VReg_512>;
+def : BitConvert <v8i64, v16f32, VReg_512>;
+def : BitConvert <v8f64, v16f32, VReg_512>;
+def : BitConvert <v16f32, v8i64, VReg_512>;
+def : BitConvert <v16f32, v8f64, VReg_512>;
+def : BitConvert <v8i64, v32f16, VReg_512>;
+
+
+def : BitConvert <v32bf16, v32i16, VReg_512>;
+def : BitConvert <v32i16, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v32i16, SReg_512>;
def : BitConvert <v32i16, v32bf16, SReg_512>;
+def : BitConvert <v32bf16, v32f16, VReg_512>;
+def : BitConvert <v32f16, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v32f16, SReg_512>;
def : BitConvert <v32f16, v32bf16, SReg_512>;
+def : BitConvert <v32bf16, v16i32, VReg_512>;
+def : BitConvert <v16i32, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v16i32, SReg_512>;
def : BitConvert <v16i32, v32bf16, SReg_512>;
+def : BitConvert <v32bf16, v16f32, VReg_512>;
+def : BitConvert <v16f32, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v16f32, SReg_512>;
def : BitConvert <v16f32, v32bf16, SReg_512>;
+def : BitConvert <v32bf16, v8f64, VReg_512>;
+def : BitConvert <v8f64, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v8f64, SReg_512>;
def : BitConvert <v8f64, v32bf16, SReg_512>;
+def : BitConvert <v32bf16, v8i64, VReg_512>;
+def : BitConvert <v8i64, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v8i64, SReg_512>;
def : BitConvert <v8i64, v32bf16, SReg_512>;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
index b36ade582c878..1f94d9b6b3bda 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
@@ -1946,70 +1946,6 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v8i64_to_v32f16:
-define <32 x half> @v_bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) {
- %cmp = icmp eq i32 %b, 0
- br i1 %cmp, label %cmp.true, label %cmp.false
-cmp.true:
- %a1 = add <8 x i64> %a, splat (i64 3)
- %a2 = bitcast <8 x i64> %a1 to <32 x half>
- br label %end
-cmp.false:
- %a3 = bitcast <8 x i64> %a to <32 x half>
- br label %end
-end:
- %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
- ret <32 x half> %phi
-}
-
-; CHECK-LABEL: {{^}}v_bitcast_v8i64_to_v32i16:
-define <32 x i16> @v_bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) {
- %cmp = icmp eq i32 %b, 0
- br i1 %cmp, label %cmp.true, label %cmp.false
-cmp.true:
- %a1 = add <8 x i64> %a, splat (i64 3)
- %a2 = bitcast <8 x i64> %a1 to <32 x i16>
- br label %end
-cmp.false:
- %a3 = bitcast <8 x i64> %a to <32 x i16>
- br label %end
-end:
- %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
- ret <32 x i16> %phi
-}
-
-; CHECK-LABEL: {{^}}v_bitcast_v8f64_to_v32i16:
-define <32 x i16> @v_bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) {
- %cmp = icmp eq i32 %b, 0
- br i1 %cmp, label %cmp.true, label %cmp.false
-cmp.true:
- %a1 = fadd <8 x double> %a, splat (double 1.000000e+00)
- %a2 = bitcast <8 x double> %a1 to <32 x i16>
- br label %end
-cmp.false:
- %a3 = bitcast <8 x double> %a to <32 x i16>
- br label %end
-end:
- %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
- ret <32 x i16> %phi
-}
-
-; CHECK-LABEL: {{^}}v_bitcast_v8f64_to_v32f16:
-define <32 x half> @v_bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) {
- %cmp = icmp eq i32 %b, 0
- br i1 %cmp, label %cmp.true, label %cmp.false
-cmp.true:
- %a1 = fadd <8 x double> %a, splat (double 1.000000e+00)
- %a2 = bitcast <8 x double> %a1 to <32 x half>
- br label %end
-cmp.false:
- %a3 = bitcast <8 x double> %a to <32 x half>
- br label %end
-end:
- %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
- ret <32 x half> %phi
-}
-
; CHECK-LABEL: {{^}}v_bitcast_v32f16_to_v8i64:
define <8 x i64> @v_bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) {
%cmp = icmp eq i32 %b, 0
@@ -2026,56 +1962,6 @@ end:
ret <8 x i64> %phi
}
-; CHECK-LABEL: {{^}}v_bitcast_v32f16_to_v8f64:
-define <8 x double> @v_bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) {
- %cmp = icmp eq i32 %b, 0
- br i1 %cmp, label %cmp.true, label %cmp.false
-cmp.true:
- %a1 = fadd <32 x half> %a, splat (half 0xH0200)
- %a2 = bitcast <32 x half> %a1 to <8 x double>
- br label %end
-cmp.false:
- %a3 = bitcast <32 x half> %a to <8 x double>
- br label %end
-end:
- %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
- ret <8 x double> %phi
-}
-
-; CHECK-LABEL: {{^}}v_bitcast_v32i16_to_v8i64:
-define <8 x i64> @v_bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) {
- %cmp = icmp eq i32 %b, 0
- br i1 %cmp, label %cmp.true, label %cmp.false
-cmp.true:
- %a1 = add <32 x i16> %a, splat (i16 3)
- %a2 = bitcast <32 x i16> %a1 to <8 x i64>
- br label %end
-cmp.false:
- %a3 = bitcast <32 x i16> %a to <8 x i64>
- br label %end
-end:
- %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
- ret <8 x i64> %phi
-}
-
-; CHECK-LABEL: {{^}}v_bitcast_v32i16_to_v8f64:
-define <8 x double> @v_bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) {
- %cmp = icmp eq i32 %b, 0
- br i1 %cmp, label %cmp.true, label %cmp.false
-cmp.true:
- %a1 = add <32 x i16> %a, splat (i16 3)
- %a2 = bitcast <32 x i16> %a1 to <8 x double>
- br label %end
-cmp.false:
- %a3 = bitcast <32 x i16> %a to <8 x double>
- br label %end
-end:
- %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
- ret <8 x double> %phi
-}
-
-
-
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index aad7a088551b2..50921879cd1f2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -466,7 +466,7 @@ def have_cxx_shared_library():
print("could not exec llvm-readobj")
return False
- readobj_out = readobj_cmd.stdout.read().decode("ascii")
+ readobj_out = readobj_cmd.stdout.read().decode("utf-8")
readobj_cmd.wait()
regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)")
>From 5046309472c1e9a1524142657d30ace18093019a Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Tue, 18 Mar 2025 20:58:42 +0800
Subject: [PATCH 3/5] fix lit
---
llvm/test/lit.cfg.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 50921879cd1f2..aad7a088551b2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -466,7 +466,7 @@ def have_cxx_shared_library():
print("could not exec llvm-readobj")
return False
- readobj_out = readobj_cmd.stdout.read().decode("utf-8")
+ readobj_out = readobj_cmd.stdout.read().decode("ascii")
readobj_cmd.wait()
regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)")
>From c31f255f25f1720664f7d0f99ed7b211092da46b Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Tue, 18 Mar 2025 20:59:55 +0800
Subject: [PATCH 4/5] fix comment
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 09d04461da514..43ddccd8fb8c5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1840,7 +1840,6 @@ def : BitConvert <v12f32, v12i32, SReg_384>;
def : BitConvert <v12i32, v12f32, VReg_384>;
def : BitConvert <v12f32, v12i32, VReg_384>;
-// 512-bit bitcast
// 512-bit bitcast
def : BitConvert <v32f16, v32i16, VReg_512>;
def : BitConvert <v32i16, v32f16, VReg_512>;
>From aa1895b69f230311e7ebc53bd2b6c9a79dbda665 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Thu, 20 Mar 2025 10:48:31 +0800
Subject: [PATCH 5/5] add other bitconvert types for vreg512
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 7 +
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll | 2537 +++++++++++++++++++-
2 files changed, 2461 insertions(+), 83 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index cfe6065ea9221..3b38f620aab42 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1864,6 +1864,13 @@ def : BitConvert <v8f64, v16f32, VReg_512>;
def : BitConvert <v16f32, v8i64, VReg_512>;
def : BitConvert <v16f32, v8f64, VReg_512>;
def : BitConvert <v8i64, v32f16, VReg_512>;
+def : BitConvert <v8i64, v32i16, VReg_512>;
+def : BitConvert <v8f64, v32f16, VReg_512>;
+def : BitConvert <v8f64, v32i16, VReg_512>;
+def : BitConvert <v32f16, v8i64, VReg_512>;
+def : BitConvert <v32f16, v8f64, VReg_512>;
+def : BitConvert <v32i16, v8i64, VReg_512>;
+def : BitConvert <v32i16, v8f64, VReg_512>;
def : BitConvert <v32bf16, v32i16, VReg_512>;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
index 767113407c37b..19a09d973098a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
@@ -19467,16 +19467,2076 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v32f16_to_v8i64:
+define <32 x half> @v_bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) {
+; GCN-LABEL: v_bitcast_v8i64_to_v32f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v34, v15
+; GCN-NEXT: v_mov_b32_e32 v33, v14
+; GCN-NEXT: v_mov_b32_e32 v36, v13
+; GCN-NEXT: v_mov_b32_e32 v35, v12
+; GCN-NEXT: v_mov_b32_e32 v38, v11
+; GCN-NEXT: v_mov_b32_e32 v37, v10
+; GCN-NEXT: v_mov_b32_e32 v48, v9
+; GCN-NEXT: v_mov_b32_e32 v39, v8
+; GCN-NEXT: v_mov_b32_e32 v50, v7
+; GCN-NEXT: v_mov_b32_e32 v49, v6
+; GCN-NEXT: v_mov_b32_e32 v52, v5
+; GCN-NEXT: v_mov_b32_e32 v51, v4
+; GCN-NEXT: v_mov_b32_e32 v54, v3
+; GCN-NEXT: v_mov_b32_e32 v53, v2
+; GCN-NEXT: v_mov_b32_e32 v55, v1
+; GCN-NEXT: v_mov_b32_e32 v32, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr12
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB127_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v34
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v36
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v35
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v38
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v37
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v48
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v50
+; GCN-NEXT: s_waitcnt expcnt(6)
+; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v49
+; GCN-NEXT: s_waitcnt expcnt(5)
+; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v52
+; GCN-NEXT: s_waitcnt expcnt(4)
+; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v51
+; GCN-NEXT: s_waitcnt expcnt(3)
+; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v54
+; GCN-NEXT: s_waitcnt expcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v53
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v55
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v30, v34
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v33
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v35
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v38
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v37
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v48
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v39
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v50
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v49
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v52
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v51
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v54
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v53
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v21, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v40
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v41
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v43
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v44
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v45
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: .LBB127_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB127_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v55, vcc
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53
+; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v54, vcc
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v52, vcc
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v50, vcc
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39
+; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v48, vcc
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37
+; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v38, vcc
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35
+; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v36, vcc
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33
+; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v34, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15
+; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v39
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v38
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v37
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v35
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32
+; GCN-NEXT: .LBB127_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v8i64_to_v32f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB127_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14
+; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc
+; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
+; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
+; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
+; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
+; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: .LBB127_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v8i64_to_v32f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB127_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14
+; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12
+; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: .LBB127_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v8i64_to_v32f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB127_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v15, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v13, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v11, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: .LBB127_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+cmp.true:
+ %a1 = add <8 x i64> %a, splat (i64 3)
+ %a2 = bitcast <8 x i64> %a1 to <32 x half>
+ br label %end
+cmp.false:
+ %a3 = bitcast <8 x i64> %a to <32 x half>
+ br label %end
+end:
+ %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x half> %phi
+}
+
+
+define <32 x i16> @v_bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) {
+; GCN-LABEL: v_bitcast_v8i64_to_v32i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v30, v15
+; GCN-NEXT: v_mov_b32_e32 v28, v14
+; GCN-NEXT: v_mov_b32_e32 v26, v13
+; GCN-NEXT: v_mov_b32_e32 v24, v12
+; GCN-NEXT: v_mov_b32_e32 v22, v11
+; GCN-NEXT: v_mov_b32_e32 v20, v10
+; GCN-NEXT: v_mov_b32_e32 v18, v9
+; GCN-NEXT: v_mov_b32_e32 v32, v8
+; GCN-NEXT: v_mov_b32_e32 v14, v7
+; GCN-NEXT: v_mov_b32_e32 v12, v6
+; GCN-NEXT: v_mov_b32_e32 v10, v5
+; GCN-NEXT: v_mov_b32_e32 v8, v4
+; GCN-NEXT: v_mov_b32_e32 v6, v3
+; GCN-NEXT: v_mov_b32_e32 v4, v2
+; GCN-NEXT: v_mov_b32_e32 v2, v1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB128_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16
+; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16
+; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16
+; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16
+; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16
+; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16
+; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GCN-NEXT: .LBB128_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB128_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12
+; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32
+; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20
+; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24
+; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28
+; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc
+; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16
+; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16
+; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16
+; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16
+; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16
+; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16
+; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GCN-NEXT: .LBB128_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v16, v32
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v8i64_to_v32i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB128_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14
+; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc
+; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
+; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
+; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
+; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
+; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: .LBB128_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v8i64_to_v32i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB128_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14
+; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12
+; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: .LBB128_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v8i64_to_v32i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB128_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v15, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v13, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v11, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: .LBB128_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; Exercises the new v8i64 <-> v32i16 bitcast pattern for 512-bit vector
+; registers (see the BitConvert<..., VReg_512> additions in this patch).
+; The divergent branch adds 3 in one arm so a modified and an unmodified
+; bitcast are both live across control flow and cannot be folded away.
+; The assertion lines above are presumably autogenerated
+; (update_llc_test_checks.py) -- do not hand-edit them.
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+cmp.true:
+ %a1 = add <8 x i64> %a, splat (i64 3)
+ %a2 = bitcast <8 x i64> %a1 to <32 x i16>
+ br label %end
+cmp.false:
+ %a3 = bitcast <8 x i64> %a to <32 x i16>
+ br label %end
+end:
+ %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x i16> %phi
+}
+
+
+define <32 x i16> @v_bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) {
+; GCN-LABEL: v_bitcast_v8f64_to_v32i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v55, v15
+; GCN-NEXT: v_mov_b32_e32 v54, v14
+; GCN-NEXT: v_mov_b32_e32 v53, v13
+; GCN-NEXT: v_mov_b32_e32 v52, v12
+; GCN-NEXT: v_mov_b32_e32 v51, v11
+; GCN-NEXT: v_mov_b32_e32 v50, v10
+; GCN-NEXT: v_mov_b32_e32 v49, v9
+; GCN-NEXT: v_mov_b32_e32 v48, v8
+; GCN-NEXT: v_mov_b32_e32 v38, v7
+; GCN-NEXT: v_mov_b32_e32 v37, v6
+; GCN-NEXT: v_mov_b32_e32 v36, v5
+; GCN-NEXT: v_mov_b32_e32 v35, v4
+; GCN-NEXT: v_mov_b32_e32 v34, v3
+; GCN-NEXT: v_mov_b32_e32 v33, v2
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB129_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_alignbit_b32 v29, v55, v54, 16
+; GCN-NEXT: v_alignbit_b32 v25, v53, v52, 16
+; GCN-NEXT: v_alignbit_b32 v21, v51, v50, 16
+; GCN-NEXT: v_alignbit_b32 v17, v49, v48, 16
+; GCN-NEXT: v_alignbit_b32 v13, v38, v37, 16
+; GCN-NEXT: v_alignbit_b32 v9, v36, v35, 16
+; GCN-NEXT: v_alignbit_b32 v5, v34, v33, 16
+; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v38
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GCN-NEXT: .LBB129_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB129_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GCN-NEXT: v_add_f64 v[33:34], v[33:34], 1.0
+; GCN-NEXT: v_add_f64 v[35:36], v[35:36], 1.0
+; GCN-NEXT: v_add_f64 v[37:38], v[37:38], 1.0
+; GCN-NEXT: v_add_f64 v[48:49], v[48:49], 1.0
+; GCN-NEXT: v_add_f64 v[50:51], v[50:51], 1.0
+; GCN-NEXT: v_add_f64 v[52:53], v[52:53], 1.0
+; GCN-NEXT: v_add_f64 v[54:55], v[54:55], 1.0
+; GCN-NEXT: v_alignbit_b32 v29, v55, v54, 16
+; GCN-NEXT: v_alignbit_b32 v25, v53, v52, 16
+; GCN-NEXT: v_alignbit_b32 v21, v51, v50, 16
+; GCN-NEXT: v_alignbit_b32 v17, v49, v48, 16
+; GCN-NEXT: v_alignbit_b32 v13, v38, v37, 16
+; GCN-NEXT: v_alignbit_b32 v9, v36, v35, 16
+; GCN-NEXT: v_alignbit_b32 v5, v34, v33, 16
+; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v38
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GCN-NEXT: .LBB129_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v1
+; GCN-NEXT: v_mov_b32_e32 v4, v33
+; GCN-NEXT: v_mov_b32_e32 v6, v34
+; GCN-NEXT: v_mov_b32_e32 v8, v35
+; GCN-NEXT: v_mov_b32_e32 v10, v36
+; GCN-NEXT: v_mov_b32_e32 v12, v37
+; GCN-NEXT: v_mov_b32_e32 v14, v38
+; GCN-NEXT: v_mov_b32_e32 v16, v48
+; GCN-NEXT: v_mov_b32_e32 v18, v49
+; GCN-NEXT: v_mov_b32_e32 v20, v50
+; GCN-NEXT: v_mov_b32_e32 v22, v51
+; GCN-NEXT: v_mov_b32_e32 v24, v52
+; GCN-NEXT: v_mov_b32_e32 v26, v53
+; GCN-NEXT: v_mov_b32_e32 v28, v54
+; GCN-NEXT: v_mov_b32_e32 v30, v55
+; GCN-NEXT: v_mov_b32_e32 v1, v32
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v8f64_to_v32i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB129_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; VI-NEXT: .LBB129_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v8f64_to_v32i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB129_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT: .LBB129_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v8f64_to_v32i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB129_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: .LBB129_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; Exercises the new v8f64 -> v32i16 bitcast pattern for 512-bit vector
+; registers. The fadd 1.0 in the true arm keeps both a modified and an
+; unmodified bitcast source live across the divergent branch so the cast
+; cannot be folded before register allocation. The assertion lines above
+; are presumably autogenerated (update_llc_test_checks.py).
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+cmp.true:
+ %a1 = fadd <8 x double> %a, splat (double 1.000000e+00)
+ %a2 = bitcast <8 x double> %a1 to <32 x i16>
+ br label %end
+cmp.false:
+ %a3 = bitcast <8 x double> %a to <32 x i16>
+ br label %end
+end:
+ %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x i16> %phi
+}
+
+
+define <32 x half> @v_bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) {
+; GCN-LABEL: v_bitcast_v8f64_to_v32f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB130_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v3
+; GCN-NEXT: s_waitcnt expcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v1
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v38, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v36, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v34, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v33, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v17
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v19
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v21
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v23
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v39
+; GCN-NEXT: v_cvt_f32_f16_e32 v21, v48
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v49
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v50
+; GCN-NEXT: v_cvt_f32_f16_e32 v48, v51
+; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52
+; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53
+; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54
+; GCN-NEXT: v_cvt_f32_f16_e32 v52, v55
+; GCN-NEXT: v_cvt_f32_f16_e32 v53, v40
+; GCN-NEXT: v_cvt_f32_f16_e32 v54, v41
+; GCN-NEXT: v_cvt_f32_f16_e32 v55, v42
+; GCN-NEXT: v_cvt_f32_f16_e32 v39, v0
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr12
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: .LBB130_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB130_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15
+; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v38, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v36, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v34, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v33, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v39, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48
+; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49
+; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50
+; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51
+; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52
+; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53
+; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54
+; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55
+; GCN-NEXT: .LBB130_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v0, v39
+; GCN-NEXT: v_mov_b32_e32 v1, v55
+; GCN-NEXT: v_mov_b32_e32 v2, v32
+; GCN-NEXT: v_mov_b32_e32 v3, v54
+; GCN-NEXT: v_mov_b32_e32 v4, v33
+; GCN-NEXT: v_mov_b32_e32 v5, v53
+; GCN-NEXT: v_mov_b32_e32 v6, v34
+; GCN-NEXT: v_mov_b32_e32 v7, v52
+; GCN-NEXT: v_mov_b32_e32 v8, v35
+; GCN-NEXT: v_mov_b32_e32 v9, v51
+; GCN-NEXT: v_mov_b32_e32 v10, v36
+; GCN-NEXT: v_mov_b32_e32 v11, v50
+; GCN-NEXT: v_mov_b32_e32 v12, v37
+; GCN-NEXT: v_mov_b32_e32 v13, v49
+; GCN-NEXT: v_mov_b32_e32 v14, v38
+; GCN-NEXT: v_mov_b32_e32 v15, v48
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v8f64_to_v32f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB130_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; VI-NEXT: .LBB130_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v8f64_to_v32f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB130_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT: .LBB130_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v8f64_to_v32f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB130_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: .LBB130_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; Exercises the new v8f64 -> v32f16 bitcast pattern for 512-bit vector
+; registers. On the base GCN target the check lines above show per-lane
+; f16 <-> f32 conversion sequences, while VI/GFX9/GFX11 keep the cast free;
+; the divergent branch keeps both a modified and an unmodified bitcast live.
+; The assertion lines are presumably autogenerated
+; (update_llc_test_checks.py) -- do not hand-edit them.
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+cmp.true:
+ %a1 = fadd <8 x double> %a, splat (double 1.000000e+00)
+ %a2 = bitcast <8 x double> %a1 to <32 x half>
+ br label %end
+cmp.false:
+ %a3 = bitcast <8 x double> %a to <32 x half>
+ br label %end
+end:
+ %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x half> %phi
+}
+
+
define <8 x i64> @v_bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) {
+; GCN-LABEL: v_bitcast_v32f16_to_v8i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB131_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43
+; GCN-NEXT: v_or_b32_e32 v0, v44, v0
+; GCN-NEXT: v_or_b32_e32 v1, v42, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20
+; GCN-NEXT: v_or_b32_e32 v2, v52, v2
+; GCN-NEXT: v_or_b32_e32 v3, v50, v3
+; GCN-NEXT: v_or_b32_e32 v4, v48, v4
+; GCN-NEXT: v_or_b32_e32 v5, v38, v5
+; GCN-NEXT: v_or_b32_e32 v6, v36, v6
+; GCN-NEXT: v_or_b32_e32 v7, v34, v7
+; GCN-NEXT: v_or_b32_e32 v8, v33, v8
+; GCN-NEXT: v_or_b32_e32 v9, v32, v9
+; GCN-NEXT: v_or_b32_e32 v10, v31, v10
+; GCN-NEXT: v_or_b32_e32 v11, v21, v11
+; GCN-NEXT: v_or_b32_e32 v12, v19, v12
+; GCN-NEXT: v_or_b32_e32 v13, v18, v13
+; GCN-NEXT: v_or_b32_e32 v14, v17, v14
+; GCN-NEXT: v_or_b32_e32 v15, v16, v15
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: .LBB131_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB131_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42
+; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0
+; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35
+; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4
+; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5
+; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6
+; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7
+; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9
+; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10
+; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11
+; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12
+; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13
+; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14
+; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15
+; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24
+; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26
+; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27
+; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28
+; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29
+; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21
+; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25
+; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19
+; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23
+; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18
+; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22
+; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17
+; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20
+; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_or_b32_e32 v2, v3, v2
+; GCN-NEXT: v_or_b32_e32 v3, v5, v4
+; GCN-NEXT: v_or_b32_e32 v4, v7, v6
+; GCN-NEXT: v_or_b32_e32 v5, v9, v8
+; GCN-NEXT: v_or_b32_e32 v6, v11, v10
+; GCN-NEXT: v_or_b32_e32 v7, v13, v12
+; GCN-NEXT: v_or_b32_e32 v8, v15, v14
+; GCN-NEXT: v_or_b32_e32 v9, v26, v24
+; GCN-NEXT: v_or_b32_e32 v10, v28, v27
+; GCN-NEXT: v_or_b32_e32 v11, v21, v29
+; GCN-NEXT: v_or_b32_e32 v12, v19, v25
+; GCN-NEXT: v_or_b32_e32 v13, v18, v23
+; GCN-NEXT: v_or_b32_e32 v14, v17, v22
+; GCN-NEXT: v_or_b32_e32 v15, v16, v20
+; GCN-NEXT: .LBB131_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v32f16_to_v8i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB131_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v16, 0x200
+; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v15, 0x200, v15
+; VI-NEXT: v_or_b32_e32 v15, v15, v17
+; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v14, 0x200, v14
+; VI-NEXT: v_or_b32_e32 v14, v14, v17
+; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v13, 0x200, v13
+; VI-NEXT: v_or_b32_e32 v13, v13, v17
+; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v12, 0x200, v12
+; VI-NEXT: v_or_b32_e32 v12, v12, v17
+; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v11, 0x200, v11
+; VI-NEXT: v_or_b32_e32 v11, v11, v17
+; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v10, 0x200, v10
+; VI-NEXT: v_or_b32_e32 v10, v10, v17
+; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v9, 0x200, v9
+; VI-NEXT: v_or_b32_e32 v9, v9, v17
+; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v8, 0x200, v8
+; VI-NEXT: v_or_b32_e32 v8, v8, v17
+; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v7, 0x200, v7
+; VI-NEXT: v_or_b32_e32 v7, v7, v17
+; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v6, 0x200, v6
+; VI-NEXT: v_or_b32_e32 v6, v6, v17
+; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v5, 0x200, v5
+; VI-NEXT: v_or_b32_e32 v5, v5, v17
+; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v4, 0x200, v4
+; VI-NEXT: v_or_b32_e32 v4, v4, v17
+; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v3, 0x200, v3
+; VI-NEXT: v_or_b32_e32 v3, v3, v17
+; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v2, 0x200, v2
+; VI-NEXT: v_or_b32_e32 v2, v2, v17
+; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
+; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v0, 0x200, v0
+; VI-NEXT: v_or_b32_e32 v1, v1, v17
+; VI-NEXT: v_or_b32_e32 v0, v0, v16
+; VI-NEXT: .LBB131_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v32f16_to_v8i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB131_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: s_movk_i32 s6, 0x200
+; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: .LBB131_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v32f16_to_v8i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB131_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-NEXT: .LBB131_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+cmp.true:
+ %a1 = fadd <32 x half> %a, splat (half 0xH0200)
+ %a2 = bitcast <32 x half> %a1 to <8 x i64>
+ br label %end
+cmp.false:
+ %a3 = bitcast <32 x half> %a to <8 x i64>
+ br label %end
+end:
+ %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <8 x i64> %phi
+}
+
+
+define <8 x double> @v_bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) {
+; GCN-LABEL: v_bitcast_v32f16_to_v8f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB132_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43
+; GCN-NEXT: v_or_b32_e32 v0, v44, v0
+; GCN-NEXT: v_or_b32_e32 v1, v42, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20
+; GCN-NEXT: v_or_b32_e32 v2, v52, v2
+; GCN-NEXT: v_or_b32_e32 v3, v50, v3
+; GCN-NEXT: v_or_b32_e32 v4, v48, v4
+; GCN-NEXT: v_or_b32_e32 v5, v38, v5
+; GCN-NEXT: v_or_b32_e32 v6, v36, v6
+; GCN-NEXT: v_or_b32_e32 v7, v34, v7
+; GCN-NEXT: v_or_b32_e32 v8, v33, v8
+; GCN-NEXT: v_or_b32_e32 v9, v32, v9
+; GCN-NEXT: v_or_b32_e32 v10, v31, v10
+; GCN-NEXT: v_or_b32_e32 v11, v21, v11
+; GCN-NEXT: v_or_b32_e32 v12, v19, v12
+; GCN-NEXT: v_or_b32_e32 v13, v18, v13
+; GCN-NEXT: v_or_b32_e32 v14, v17, v14
+; GCN-NEXT: v_or_b32_e32 v15, v16, v15
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: .LBB132_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB132_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42
+; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0
+; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35
+; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4
+; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5
+; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6
+; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7
+; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9
+; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10
+; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11
+; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12
+; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13
+; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14
+; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15
+; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24
+; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26
+; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27
+; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28
+; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29
+; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21
+; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25
+; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19
+; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23
+; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18
+; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22
+; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17
+; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20
+; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_or_b32_e32 v2, v3, v2
+; GCN-NEXT: v_or_b32_e32 v3, v5, v4
+; GCN-NEXT: v_or_b32_e32 v4, v7, v6
+; GCN-NEXT: v_or_b32_e32 v5, v9, v8
+; GCN-NEXT: v_or_b32_e32 v6, v11, v10
+; GCN-NEXT: v_or_b32_e32 v7, v13, v12
+; GCN-NEXT: v_or_b32_e32 v8, v15, v14
+; GCN-NEXT: v_or_b32_e32 v9, v26, v24
+; GCN-NEXT: v_or_b32_e32 v10, v28, v27
+; GCN-NEXT: v_or_b32_e32 v11, v21, v29
+; GCN-NEXT: v_or_b32_e32 v12, v19, v25
+; GCN-NEXT: v_or_b32_e32 v13, v18, v23
+; GCN-NEXT: v_or_b32_e32 v14, v17, v22
+; GCN-NEXT: v_or_b32_e32 v15, v16, v20
+; GCN-NEXT: .LBB132_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v32f16_to_v8f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB132_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v16, 0x200
+; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v15, 0x200, v15
+; VI-NEXT: v_or_b32_e32 v15, v15, v17
+; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v14, 0x200, v14
+; VI-NEXT: v_or_b32_e32 v14, v14, v17
+; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v13, 0x200, v13
+; VI-NEXT: v_or_b32_e32 v13, v13, v17
+; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v12, 0x200, v12
+; VI-NEXT: v_or_b32_e32 v12, v12, v17
+; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v11, 0x200, v11
+; VI-NEXT: v_or_b32_e32 v11, v11, v17
+; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v10, 0x200, v10
+; VI-NEXT: v_or_b32_e32 v10, v10, v17
+; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v9, 0x200, v9
+; VI-NEXT: v_or_b32_e32 v9, v9, v17
+; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v8, 0x200, v8
+; VI-NEXT: v_or_b32_e32 v8, v8, v17
+; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v7, 0x200, v7
+; VI-NEXT: v_or_b32_e32 v7, v7, v17
+; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v6, 0x200, v6
+; VI-NEXT: v_or_b32_e32 v6, v6, v17
+; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v5, 0x200, v5
+; VI-NEXT: v_or_b32_e32 v5, v5, v17
+; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v4, 0x200, v4
+; VI-NEXT: v_or_b32_e32 v4, v4, v17
+; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v3, 0x200, v3
+; VI-NEXT: v_or_b32_e32 v3, v3, v17
+; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v2, 0x200, v2
+; VI-NEXT: v_or_b32_e32 v2, v2, v17
+; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
+; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v0, 0x200, v0
+; VI-NEXT: v_or_b32_e32 v1, v1, v17
+; VI-NEXT: v_or_b32_e32 v0, v0, v16
+; VI-NEXT: .LBB132_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v32f16_to_v8f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB132_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: s_movk_i32 s6, 0x200
+; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: .LBB132_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v32f16_to_v8f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB132_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-NEXT: .LBB132_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
cmp.true:
%a1 = fadd <32 x half> %a, splat (half 0xH0200)
- %a2 = bitcast <32 x half> %a1 to <8 x i64>
+ %a2 = bitcast <32 x half> %a1 to <8 x double>
br label %end
cmp.false:
- %a3 = bitcast <32 x half> %a to <8 x i64>
+ %a3 = bitcast <32 x half> %a to <8 x double>
+ br label %end
+end:
+ %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <8 x double> %phi
+}
+
+
+define <8 x i64> @v_bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) {
+; GCN-LABEL: v_bitcast_v32i16_to_v8i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v38, v14
+; GCN-NEXT: v_mov_b32_e32 v37, v12
+; GCN-NEXT: v_mov_b32_e32 v36, v10
+; GCN-NEXT: v_mov_b32_e32 v35, v8
+; GCN-NEXT: v_mov_b32_e32 v34, v6
+; GCN-NEXT: v_mov_b32_e32 v33, v4
+; GCN-NEXT: v_mov_b32_e32 v32, v2
+; GCN-NEXT: v_mov_b32_e32 v31, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB133_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB133_4
+; GCN-NEXT: .LBB133_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB133_3: ; %cmp.false
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v54
+; GCN-NEXT: v_or_b32_e32 v1, v1, v55
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30
+; GCN-NEXT: v_or_b32_e32 v2, v2, v39
+; GCN-NEXT: v_or_b32_e32 v3, v3, v48
+; GCN-NEXT: v_or_b32_e32 v4, v4, v49
+; GCN-NEXT: v_or_b32_e32 v5, v5, v50
+; GCN-NEXT: v_or_b32_e32 v6, v6, v51
+; GCN-NEXT: v_or_b32_e32 v7, v7, v52
+; GCN-NEXT: v_or_b32_e32 v8, v8, v17
+; GCN-NEXT: v_or_b32_e32 v9, v9, v19
+; GCN-NEXT: v_or_b32_e32 v10, v10, v21
+; GCN-NEXT: v_or_b32_e32 v11, v11, v23
+; GCN-NEXT: v_or_b32_e32 v12, v12, v25
+; GCN-NEXT: v_or_b32_e32 v13, v13, v27
+; GCN-NEXT: v_or_b32_e32 v14, v14, v29
+; GCN-NEXT: v_or_b32_e32 v15, v15, v53
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB133_2
+; GCN-NEXT: .LBB133_4: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31
+; GCN-NEXT: s_mov_b32 s6, 0x30000
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GCN-NEXT: v_or_b32_e32 v0, v54, v0
+; GCN-NEXT: v_or_b32_e32 v1, v55, v1
+; GCN-NEXT: v_or_b32_e32 v2, v39, v2
+; GCN-NEXT: v_or_b32_e32 v3, v48, v3
+; GCN-NEXT: v_or_b32_e32 v4, v49, v4
+; GCN-NEXT: v_or_b32_e32 v5, v50, v5
+; GCN-NEXT: v_or_b32_e32 v6, v51, v6
+; GCN-NEXT: v_or_b32_e32 v7, v52, v7
+; GCN-NEXT: v_or_b32_e32 v8, v17, v8
+; GCN-NEXT: v_or_b32_e32 v9, v19, v9
+; GCN-NEXT: v_or_b32_e32 v10, v21, v10
+; GCN-NEXT: v_or_b32_e32 v11, v23, v11
+; GCN-NEXT: v_or_b32_e32 v12, v25, v12
+; GCN-NEXT: v_or_b32_e32 v13, v27, v13
+; GCN-NEXT: v_or_b32_e32 v14, v29, v14
+; GCN-NEXT: v_or_b32_e32 v15, v53, v15
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1
+; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2
+; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3
+; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4
+; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5
+; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6
+; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7
+; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8
+; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9
+; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10
+; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11
+; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v32i16_to_v8i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB133_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v17, 3
+; VI-NEXT: v_add_u16_e32 v16, 3, v15
+; VI-NEXT: v_add_u16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v15, v16, v15
+; VI-NEXT: v_add_u16_e32 v16, 3, v14
+; VI-NEXT: v_add_u16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v14, v16, v14
+; VI-NEXT: v_add_u16_e32 v16, 3, v13
+; VI-NEXT: v_add_u16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v13, v16, v13
+; VI-NEXT: v_add_u16_e32 v16, 3, v12
+; VI-NEXT: v_add_u16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v12, v16, v12
+; VI-NEXT: v_add_u16_e32 v16, 3, v11
+; VI-NEXT: v_add_u16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v11, v16, v11
+; VI-NEXT: v_add_u16_e32 v16, 3, v10
+; VI-NEXT: v_add_u16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v10, v16, v10
+; VI-NEXT: v_add_u16_e32 v16, 3, v9
+; VI-NEXT: v_add_u16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v9, v16, v9
+; VI-NEXT: v_add_u16_e32 v16, 3, v8
+; VI-NEXT: v_add_u16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v8, v16, v8
+; VI-NEXT: v_add_u16_e32 v16, 3, v7
+; VI-NEXT: v_add_u16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v7, v16, v7
+; VI-NEXT: v_add_u16_e32 v16, 3, v6
+; VI-NEXT: v_add_u16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v6, v16, v6
+; VI-NEXT: v_add_u16_e32 v16, 3, v5
+; VI-NEXT: v_add_u16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v5, v16, v5
+; VI-NEXT: v_add_u16_e32 v16, 3, v4
+; VI-NEXT: v_add_u16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v4, v16, v4
+; VI-NEXT: v_add_u16_e32 v16, 3, v3
+; VI-NEXT: v_add_u16_sdwa v3, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v3, v16, v3
+; VI-NEXT: v_add_u16_e32 v16, 3, v2
+; VI-NEXT: v_add_u16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v16, v2
+; VI-NEXT: v_add_u16_e32 v16, 3, v1
+; VI-NEXT: v_add_u16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v1, v16, v1
+; VI-NEXT: v_add_u16_e32 v16, 3, v0
+; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v16, v0
+; VI-NEXT: .LBB133_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v32i16_to_v8i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB133_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: .LBB133_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v32i16_to_v8i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB133_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: .LBB133_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+cmp.true:
+ %a1 = add <32 x i16> %a, splat (i16 3)
+ %a2 = bitcast <32 x i16> %a1 to <8 x i64>
+ br label %end
+cmp.false:
+ %a3 = bitcast <32 x i16> %a to <8 x i64>
br label %end
end:
%phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
@@ -19484,6 +21544,317 @@ end:
}
+define <8 x double> @v_bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) {
+; GCN-LABEL: v_bitcast_v32i16_to_v8f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v38, v14
+; GCN-NEXT: v_mov_b32_e32 v37, v12
+; GCN-NEXT: v_mov_b32_e32 v36, v10
+; GCN-NEXT: v_mov_b32_e32 v35, v8
+; GCN-NEXT: v_mov_b32_e32 v34, v6
+; GCN-NEXT: v_mov_b32_e32 v33, v4
+; GCN-NEXT: v_mov_b32_e32 v32, v2
+; GCN-NEXT: v_mov_b32_e32 v31, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB134_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB134_4
+; GCN-NEXT: .LBB134_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB134_3: ; %cmp.false
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v54
+; GCN-NEXT: v_or_b32_e32 v1, v1, v55
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30
+; GCN-NEXT: v_or_b32_e32 v2, v2, v39
+; GCN-NEXT: v_or_b32_e32 v3, v3, v48
+; GCN-NEXT: v_or_b32_e32 v4, v4, v49
+; GCN-NEXT: v_or_b32_e32 v5, v5, v50
+; GCN-NEXT: v_or_b32_e32 v6, v6, v51
+; GCN-NEXT: v_or_b32_e32 v7, v7, v52
+; GCN-NEXT: v_or_b32_e32 v8, v8, v17
+; GCN-NEXT: v_or_b32_e32 v9, v9, v19
+; GCN-NEXT: v_or_b32_e32 v10, v10, v21
+; GCN-NEXT: v_or_b32_e32 v11, v11, v23
+; GCN-NEXT: v_or_b32_e32 v12, v12, v25
+; GCN-NEXT: v_or_b32_e32 v13, v13, v27
+; GCN-NEXT: v_or_b32_e32 v14, v14, v29
+; GCN-NEXT: v_or_b32_e32 v15, v15, v53
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB134_2
+; GCN-NEXT: .LBB134_4: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31
+; GCN-NEXT: s_mov_b32 s6, 0x30000
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GCN-NEXT: v_or_b32_e32 v0, v54, v0
+; GCN-NEXT: v_or_b32_e32 v1, v55, v1
+; GCN-NEXT: v_or_b32_e32 v2, v39, v2
+; GCN-NEXT: v_or_b32_e32 v3, v48, v3
+; GCN-NEXT: v_or_b32_e32 v4, v49, v4
+; GCN-NEXT: v_or_b32_e32 v5, v50, v5
+; GCN-NEXT: v_or_b32_e32 v6, v51, v6
+; GCN-NEXT: v_or_b32_e32 v7, v52, v7
+; GCN-NEXT: v_or_b32_e32 v8, v17, v8
+; GCN-NEXT: v_or_b32_e32 v9, v19, v9
+; GCN-NEXT: v_or_b32_e32 v10, v21, v10
+; GCN-NEXT: v_or_b32_e32 v11, v23, v11
+; GCN-NEXT: v_or_b32_e32 v12, v25, v12
+; GCN-NEXT: v_or_b32_e32 v13, v27, v13
+; GCN-NEXT: v_or_b32_e32 v14, v29, v14
+; GCN-NEXT: v_or_b32_e32 v15, v53, v15
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1
+; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2
+; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3
+; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4
+; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5
+; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6
+; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7
+; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8
+; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9
+; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10
+; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11
+; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v32i16_to_v8f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB134_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v17, 3
+; VI-NEXT: v_add_u16_e32 v16, 3, v15
+; VI-NEXT: v_add_u16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v15, v16, v15
+; VI-NEXT: v_add_u16_e32 v16, 3, v14
+; VI-NEXT: v_add_u16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v14, v16, v14
+; VI-NEXT: v_add_u16_e32 v16, 3, v13
+; VI-NEXT: v_add_u16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v13, v16, v13
+; VI-NEXT: v_add_u16_e32 v16, 3, v12
+; VI-NEXT: v_add_u16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v12, v16, v12
+; VI-NEXT: v_add_u16_e32 v16, 3, v11
+; VI-NEXT: v_add_u16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v11, v16, v11
+; VI-NEXT: v_add_u16_e32 v16, 3, v10
+; VI-NEXT: v_add_u16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v10, v16, v10
+; VI-NEXT: v_add_u16_e32 v16, 3, v9
+; VI-NEXT: v_add_u16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v9, v16, v9
+; VI-NEXT: v_add_u16_e32 v16, 3, v8
+; VI-NEXT: v_add_u16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v8, v16, v8
+; VI-NEXT: v_add_u16_e32 v16, 3, v7
+; VI-NEXT: v_add_u16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v7, v16, v7
+; VI-NEXT: v_add_u16_e32 v16, 3, v6
+; VI-NEXT: v_add_u16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v6, v16, v6
+; VI-NEXT: v_add_u16_e32 v16, 3, v5
+; VI-NEXT: v_add_u16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v5, v16, v5
+; VI-NEXT: v_add_u16_e32 v16, 3, v4
+; VI-NEXT: v_add_u16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v4, v16, v4
+; VI-NEXT: v_add_u16_e32 v16, 3, v3
+; VI-NEXT: v_add_u16_sdwa v3, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v3, v16, v3
+; VI-NEXT: v_add_u16_e32 v16, 3, v2
+; VI-NEXT: v_add_u16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v16, v2
+; VI-NEXT: v_add_u16_e32 v16, 3, v1
+; VI-NEXT: v_add_u16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v1, v16, v1
+; VI-NEXT: v_add_u16_e32 v16, 3, v0
+; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v16, v0
+; VI-NEXT: .LBB134_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v32i16_to_v8f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB134_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: .LBB134_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v32i16_to_v8f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB134_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: .LBB134_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+cmp.true:
+ %a1 = add <32 x i16> %a, splat (i16 3)
+ %a2 = bitcast <32 x i16> %a1 to <8 x double>
+ br label %end
+cmp.false:
+ %a3 = bitcast <32 x i16> %a to <8 x double>
+ br label %end
+end:
+ %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <8 x double> %phi
+}
@@ -19648,7 +22019,7 @@ define void @v_bitcast_v32f32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB127_2
+; GCN-NEXT: s_cbranch_execz .LBB135_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v63
@@ -19783,7 +22154,7 @@ define void @v_bitcast_v32f32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x
; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v4
; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3
-; GCN-NEXT: .LBB127_2: ; %end
+; GCN-NEXT: .LBB135_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v59
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -20083,7 +22454,7 @@ define void @v_bitcast_v32f32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB127_2
+; VI-NEXT: s_cbranch_execz .LBB135_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v50, v18
; VI-NEXT: v_mov_b32_e32 v49, v17
@@ -20118,7 +22489,7 @@ define void @v_bitcast_v32f32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: .LBB127_2: ; %end
+; VI-NEXT: .LBB135_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
@@ -20239,7 +22610,7 @@ define void @v_bitcast_v32f32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB127_2
+; GFX9-NEXT: s_cbranch_execz .LBB135_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v50, v18
; GFX9-NEXT: v_mov_b32_e32 v49, v17
@@ -20275,7 +22646,7 @@ define void @v_bitcast_v32f32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: .LBB127_2: ; %end
+; GFX9-NEXT: .LBB135_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
@@ -20375,7 +22746,7 @@ define void @v_bitcast_v32f32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x
; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB127_2
+; GFX11-NEXT: s_cbranch_execz .LBB135_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -20394,7 +22765,7 @@ define void @v_bitcast_v32f32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x
; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
-; GFX11-NEXT: .LBB127_2: ; %end
+; GFX11-NEXT: .LBB135_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
@@ -20599,7 +22970,7 @@ define void @v_bitcast_v32i32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB128_2
+; GCN-NEXT: s_cbranch_execz .LBB136_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v63
@@ -20734,7 +23105,7 @@ define void @v_bitcast_v32i32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x
; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v4
; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3
-; GCN-NEXT: .LBB128_2: ; %end
+; GCN-NEXT: .LBB136_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v59
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -21034,7 +23405,7 @@ define void @v_bitcast_v32i32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB128_2
+; VI-NEXT: s_cbranch_execz .LBB136_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v50, v18
; VI-NEXT: v_mov_b32_e32 v49, v17
@@ -21069,7 +23440,7 @@ define void @v_bitcast_v32i32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: .LBB128_2: ; %end
+; VI-NEXT: .LBB136_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
@@ -21190,7 +23561,7 @@ define void @v_bitcast_v32i32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB128_2
+; GFX9-NEXT: s_cbranch_execz .LBB136_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v50, v18
; GFX9-NEXT: v_mov_b32_e32 v49, v17
@@ -21226,7 +23597,7 @@ define void @v_bitcast_v32i32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: .LBB128_2: ; %end
+; GFX9-NEXT: .LBB136_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
@@ -21326,7 +23697,7 @@ define void @v_bitcast_v32i32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x
; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB128_2
+; GFX11-NEXT: s_cbranch_execz .LBB136_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -21345,7 +23716,7 @@ define void @v_bitcast_v32i32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x
; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
-; GFX11-NEXT: .LBB128_2: ; %end
+; GFX11-NEXT: .LBB136_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
@@ -21640,7 +24011,7 @@ define void @v_bitcast_v64i16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB129_2
+; GCN-NEXT: s_cbranch_execz .LBB137_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4
@@ -21836,7 +24207,7 @@ define void @v_bitcast_v64i16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v60
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GCN-NEXT: .LBB129_2: ; %end
+; GCN-NEXT: .LBB137_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v59
@@ -22177,7 +24548,7 @@ define void @v_bitcast_v64i16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB129_2
+; VI-NEXT: s_cbranch_execz .LBB137_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v50, v18
; VI-NEXT: v_mov_b32_e32 v49, v17
@@ -22212,7 +24583,7 @@ define void @v_bitcast_v64i16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: .LBB129_2: ; %end
+; VI-NEXT: .LBB137_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
@@ -22333,7 +24704,7 @@ define void @v_bitcast_v64i16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB129_2
+; GFX9-NEXT: s_cbranch_execz .LBB137_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v50, v18
; GFX9-NEXT: v_mov_b32_e32 v49, v17
@@ -22369,7 +24740,7 @@ define void @v_bitcast_v64i16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: .LBB129_2: ; %end
+; GFX9-NEXT: .LBB137_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
@@ -22469,7 +24840,7 @@ define void @v_bitcast_v64i16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB129_2
+; GFX11-NEXT: s_cbranch_execz .LBB137_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -22488,7 +24859,7 @@ define void @v_bitcast_v64i16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
-; GFX11-NEXT: .LBB129_2: ; %end
+; GFX11-NEXT: .LBB137_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
@@ -22795,7 +25166,7 @@ define void @v_bitcast_v64f16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB130_2
+; GCN-NEXT: s_cbranch_execz .LBB138_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_cvt_f16_f32_e32 v57, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v59, v4
@@ -23062,7 +25433,7 @@ define void @v_bitcast_v64f16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x
; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15
; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v14
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v0
-; GCN-NEXT: .LBB130_2: ; %end
+; GCN-NEXT: .LBB138_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v59
@@ -23384,7 +25755,7 @@ define void @v_bitcast_v64f16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB130_2
+; VI-NEXT: s_cbranch_execz .LBB138_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v50, v18
; VI-NEXT: v_mov_b32_e32 v49, v17
@@ -23419,7 +25790,7 @@ define void @v_bitcast_v64f16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: .LBB130_2: ; %end
+; VI-NEXT: .LBB138_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
@@ -23540,7 +25911,7 @@ define void @v_bitcast_v64f16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB130_2
+; GFX9-NEXT: s_cbranch_execz .LBB138_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v50, v18
; GFX9-NEXT: v_mov_b32_e32 v49, v17
@@ -23576,7 +25947,7 @@ define void @v_bitcast_v64f16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: .LBB130_2: ; %end
+; GFX9-NEXT: .LBB138_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
@@ -23676,7 +26047,7 @@ define void @v_bitcast_v64f16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB130_2
+; GFX11-NEXT: s_cbranch_execz .LBB138_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -23695,7 +26066,7 @@ define void @v_bitcast_v64f16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
-; GFX11-NEXT: .LBB130_2: ; %end
+; GFX11-NEXT: .LBB138_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
@@ -24245,7 +26616,7 @@ define void @v_bitcast_v128i8_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <128
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB131_2
+; GCN-NEXT: s_cbranch_execz .LBB139_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xff, v7
; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v8
@@ -24762,7 +27133,7 @@ define void @v_bitcast_v128i8_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <128
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v57
; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; GCN-NEXT: .LBB131_2: ; %end
+; GCN-NEXT: .LBB139_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v12
@@ -25306,7 +27677,7 @@ define void @v_bitcast_v128i8_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <128
; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB131_2
+; VI-NEXT: s_cbranch_execz .LBB139_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
@@ -25641,7 +28012,7 @@ define void @v_bitcast_v128i8_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <128
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v62, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: .LBB131_2: ; %end
+; VI-NEXT: .LBB139_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
@@ -26026,7 +28397,7 @@ define void @v_bitcast_v128i8_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <128
; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB131_2
+; GFX9-NEXT: s_cbranch_execz .LBB139_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
@@ -26362,7 +28733,7 @@ define void @v_bitcast_v128i8_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <128
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v62, v3, v0, s6
-; GFX9-NEXT: .LBB131_2: ; %end
+; GFX9-NEXT: .LBB139_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:32
@@ -26588,7 +28959,7 @@ define void @v_bitcast_v128i8_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <128
; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB131_2
+; GFX11-NEXT: s_cbranch_execz .LBB139_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4
@@ -26815,7 +29186,7 @@ define void @v_bitcast_v128i8_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <128
; GFX11-NEXT: v_perm_b32 v60, v7, v6, 0x5040100
; GFX11-NEXT: v_perm_b32 v61, v9, v8, 0x5040100
; GFX11-NEXT: v_perm_b32 v62, v11, v10, 0x5040100
-; GFX11-NEXT: .LBB131_2: ; %end
+; GFX11-NEXT: .LBB139_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:48
@@ -27061,7 +29432,7 @@ define void @v_bitcast_v64bf16_to_v64i16(i32 %cond, ptr addrspace(1) %out, <64 x
; GCN-NEXT: v_mov_b32_e32 v62, v31
; GCN-NEXT: v_mov_b32_e32 v63, v31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB132_2
+; GCN-NEXT: s_cbranch_execz .LBB140_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -27261,7 +29632,7 @@ define void @v_bitcast_v64bf16_to_v64i16(i32 %cond, ptr addrspace(1) %out, <64 x
; GCN-NEXT: v_alignbit_b32 v61, v24, v17, 16
; GCN-NEXT: v_alignbit_b32 v62, v20, v18, 16
; GCN-NEXT: v_alignbit_b32 v63, v21, v19, 16
-; GCN-NEXT: .LBB132_2: ; %end
+; GCN-NEXT: .LBB140_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
@@ -27362,7 +29733,7 @@ define void @v_bitcast_v64bf16_to_v64i16(i32 %cond, ptr addrspace(1) %out, <64 x
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB132_2
+; VI-NEXT: s_cbranch_execz .LBB140_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v50, v18
; VI-NEXT: v_mov_b32_e32 v49, v17
@@ -27397,7 +29768,7 @@ define void @v_bitcast_v64bf16_to_v64i16(i32 %cond, ptr addrspace(1) %out, <64 x
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: .LBB132_2: ; %end
+; VI-NEXT: .LBB140_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
@@ -27518,7 +29889,7 @@ define void @v_bitcast_v64bf16_to_v64i16(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB132_2
+; GFX9-NEXT: s_cbranch_execz .LBB140_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v50, v18
; GFX9-NEXT: v_mov_b32_e32 v49, v17
@@ -27554,7 +29925,7 @@ define void @v_bitcast_v64bf16_to_v64i16(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: .LBB132_2: ; %end
+; GFX9-NEXT: .LBB140_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
@@ -27654,7 +30025,7 @@ define void @v_bitcast_v64bf16_to_v64i16(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB132_2
+; GFX11-NEXT: s_cbranch_execz .LBB140_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -27673,7 +30044,7 @@ define void @v_bitcast_v64bf16_to_v64i16(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
-; GFX11-NEXT: .LBB132_2: ; %end
+; GFX11-NEXT: .LBB140_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
@@ -27982,7 +30353,7 @@ define void @v_bitcast_v64bf16_to_v64f16(i32 %cond, ptr addrspace(1) %out, <64 x
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB133_2
+; GCN-NEXT: s_cbranch_execz .LBB141_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
@@ -28473,7 +30844,7 @@ define void @v_bitcast_v64bf16_to_v64f16(i32 %cond, ptr addrspace(1) %out, <64 x
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GCN-NEXT: .LBB133_2: ; %end
+; GCN-NEXT: .LBB141_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v58
@@ -28784,7 +31155,7 @@ define void @v_bitcast_v64bf16_to_v64f16(i32 %cond, ptr addrspace(1) %out, <64 x
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB133_2
+; VI-NEXT: s_cbranch_execz .LBB141_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v50, v18
; VI-NEXT: v_mov_b32_e32 v49, v17
@@ -28819,7 +31190,7 @@ define void @v_bitcast_v64bf16_to_v64f16(i32 %cond, ptr addrspace(1) %out, <64 x
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: .LBB133_2: ; %end
+; VI-NEXT: .LBB141_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
@@ -28940,7 +31311,7 @@ define void @v_bitcast_v64bf16_to_v64f16(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB133_2
+; GFX9-NEXT: s_cbranch_execz .LBB141_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v50, v18
; GFX9-NEXT: v_mov_b32_e32 v49, v17
@@ -28976,7 +31347,7 @@ define void @v_bitcast_v64bf16_to_v64f16(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: .LBB133_2: ; %end
+; GFX9-NEXT: .LBB141_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
@@ -29076,7 +31447,7 @@ define void @v_bitcast_v64bf16_to_v64f16(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB133_2
+; GFX11-NEXT: s_cbranch_execz .LBB141_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -29095,7 +31466,7 @@ define void @v_bitcast_v64bf16_to_v64f16(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
-; GFX11-NEXT: .LBB133_2: ; %end
+; GFX11-NEXT: .LBB141_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
@@ -29305,7 +31676,7 @@ define void @v_bitcast_v64bf16_to_v128i8(i32 %cond, ptr addrspace(1) %out, <64 x
; GCN-NEXT: v_mov_b32_e32 v62, v31
; GCN-NEXT: v_mov_b32_e32 v63, v31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB134_2
+; GCN-NEXT: s_cbranch_execz .LBB142_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -29505,7 +31876,7 @@ define void @v_bitcast_v64bf16_to_v128i8(i32 %cond, ptr addrspace(1) %out, <64 x
; GCN-NEXT: v_alignbit_b32 v61, v24, v17, 16
; GCN-NEXT: v_alignbit_b32 v62, v20, v18, 16
; GCN-NEXT: v_alignbit_b32 v63, v21, v19, 16
-; GCN-NEXT: .LBB134_2: ; %end
+; GCN-NEXT: .LBB142_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
@@ -29615,7 +31986,7 @@ define void @v_bitcast_v64bf16_to_v128i8(i32 %cond, ptr addrspace(1) %out, <64 x
; VI-NEXT: v_mov_b32_e32 v33, v3
; VI-NEXT: v_mov_b32_e32 v34, v3
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB134_2
+; VI-NEXT: s_cbranch_execz .LBB142_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -29649,7 +32020,7 @@ define void @v_bitcast_v64bf16_to_v128i8(i32 %cond, ptr addrspace(1) %out, <64 x
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT: .LBB134_2: ; %end
+; VI-NEXT: .LBB142_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v35, vcc, 0x70, v1
; VI-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
@@ -29758,7 +32129,7 @@ define void @v_bitcast_v64bf16_to_v128i8(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX9-NEXT: v_mov_b32_e32 v33, v3
; GFX9-NEXT: v_mov_b32_e32 v34, v3
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB134_2
+; GFX9-NEXT: s_cbranch_execz .LBB142_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -29792,7 +32163,7 @@ define void @v_bitcast_v64bf16_to_v128i8(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-NEXT: .LBB134_2: ; %end
+; GFX9-NEXT: .LBB142_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:112
@@ -29866,7 +32237,7 @@ define void @v_bitcast_v64bf16_to_v128i8(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX11-NEXT: v_mov_b32_e32 v65, v35
; GFX11-NEXT: v_mov_b32_e32 v66, v35
; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_cbranch_execz .LBB134_2
+; GFX11-NEXT: s_cbranch_execz .LBB142_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v66, v34 :: v_dual_mov_b32 v65, v33
@@ -29885,7 +32256,7 @@ define void @v_bitcast_v64bf16_to_v128i8(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX11-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v39, v7
; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v37, v5
; GFX11-NEXT: v_dual_mov_b32 v36, v4 :: v_dual_mov_b32 v35, v3
-; GFX11-NEXT: .LBB134_2: ; %end
+; GFX11-NEXT: .LBB142_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:112
@@ -30095,7 +32466,7 @@ define void @v_bitcast_v64bf16_to_v16i64(i32 %cond, ptr addrspace(1) %out, <64 x
; GCN-NEXT: v_mov_b32_e32 v62, v31
; GCN-NEXT: v_mov_b32_e32 v63, v31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB135_2
+; GCN-NEXT: s_cbranch_execz .LBB143_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -30295,7 +32666,7 @@ define void @v_bitcast_v64bf16_to_v16i64(i32 %cond, ptr addrspace(1) %out, <64 x
; GCN-NEXT: v_alignbit_b32 v61, v24, v17, 16
; GCN-NEXT: v_alignbit_b32 v62, v20, v18, 16
; GCN-NEXT: v_alignbit_b32 v63, v21, v19, 16
-; GCN-NEXT: .LBB135_2: ; %end
+; GCN-NEXT: .LBB143_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
@@ -30405,7 +32776,7 @@ define void @v_bitcast_v64bf16_to_v16i64(i32 %cond, ptr addrspace(1) %out, <64 x
; VI-NEXT: v_mov_b32_e32 v33, v3
; VI-NEXT: v_mov_b32_e32 v34, v3
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB135_2
+; VI-NEXT: s_cbranch_execz .LBB143_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -30439,7 +32810,7 @@ define void @v_bitcast_v64bf16_to_v16i64(i32 %cond, ptr addrspace(1) %out, <64 x
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT: .LBB135_2: ; %end
+; VI-NEXT: .LBB143_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v35, vcc, 0x70, v1
; VI-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
@@ -30548,7 +32919,7 @@ define void @v_bitcast_v64bf16_to_v16i64(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX9-NEXT: v_mov_b32_e32 v33, v3
; GFX9-NEXT: v_mov_b32_e32 v34, v3
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB135_2
+; GFX9-NEXT: s_cbranch_execz .LBB143_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -30582,7 +32953,7 @@ define void @v_bitcast_v64bf16_to_v16i64(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-NEXT: .LBB135_2: ; %end
+; GFX9-NEXT: .LBB143_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:112
@@ -30656,7 +33027,7 @@ define void @v_bitcast_v64bf16_to_v16i64(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX11-NEXT: v_mov_b32_e32 v65, v35
; GFX11-NEXT: v_mov_b32_e32 v66, v35
; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_cbranch_execz .LBB135_2
+; GFX11-NEXT: s_cbranch_execz .LBB143_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v66, v34 :: v_dual_mov_b32 v65, v33
@@ -30675,7 +33046,7 @@ define void @v_bitcast_v64bf16_to_v16i64(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX11-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v39, v7
; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v37, v5
; GFX11-NEXT: v_dual_mov_b32 v36, v4 :: v_dual_mov_b32 v35, v3
-; GFX11-NEXT: .LBB135_2: ; %end
+; GFX11-NEXT: .LBB143_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:112
@@ -30885,7 +33256,7 @@ define void @v_bitcast_v64bf16_to_v16f64(i32 %cond, ptr addrspace(1) %out, <64 x
; GCN-NEXT: v_mov_b32_e32 v62, v31
; GCN-NEXT: v_mov_b32_e32 v63, v31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB136_2
+; GCN-NEXT: s_cbranch_execz .LBB144_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -31085,7 +33456,7 @@ define void @v_bitcast_v64bf16_to_v16f64(i32 %cond, ptr addrspace(1) %out, <64 x
; GCN-NEXT: v_alignbit_b32 v61, v24, v17, 16
; GCN-NEXT: v_alignbit_b32 v62, v20, v18, 16
; GCN-NEXT: v_alignbit_b32 v63, v21, v19, 16
-; GCN-NEXT: .LBB136_2: ; %end
+; GCN-NEXT: .LBB144_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
@@ -31195,7 +33566,7 @@ define void @v_bitcast_v64bf16_to_v16f64(i32 %cond, ptr addrspace(1) %out, <64 x
; VI-NEXT: v_mov_b32_e32 v33, v3
; VI-NEXT: v_mov_b32_e32 v34, v3
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB136_2
+; VI-NEXT: s_cbranch_execz .LBB144_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -31229,7 +33600,7 @@ define void @v_bitcast_v64bf16_to_v16f64(i32 %cond, ptr addrspace(1) %out, <64 x
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT: .LBB136_2: ; %end
+; VI-NEXT: .LBB144_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v35, vcc, 0x70, v1
; VI-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
@@ -31338,7 +33709,7 @@ define void @v_bitcast_v64bf16_to_v16f64(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX9-NEXT: v_mov_b32_e32 v33, v3
; GFX9-NEXT: v_mov_b32_e32 v34, v3
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB136_2
+; GFX9-NEXT: s_cbranch_execz .LBB144_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -31372,7 +33743,7 @@ define void @v_bitcast_v64bf16_to_v16f64(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-NEXT: .LBB136_2: ; %end
+; GFX9-NEXT: .LBB144_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:112
@@ -31446,7 +33817,7 @@ define void @v_bitcast_v64bf16_to_v16f64(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX11-NEXT: v_mov_b32_e32 v65, v35
; GFX11-NEXT: v_mov_b32_e32 v66, v35
; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_cbranch_execz .LBB136_2
+; GFX11-NEXT: s_cbranch_execz .LBB144_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v66, v34 :: v_dual_mov_b32 v65, v33
@@ -31465,7 +33836,7 @@ define void @v_bitcast_v64bf16_to_v16f64(i32 %cond, ptr addrspace(1) %out, <64 x
; GFX11-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v39, v7
; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v37, v5
; GFX11-NEXT: v_dual_mov_b32 v36, v4 :: v_dual_mov_b32 v35, v3
-; GFX11-NEXT: .LBB136_2: ; %end
+; GFX11-NEXT: .LBB144_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:112
More information about the llvm-commits
mailing list