[llvm] [LegalizeIntegerTypes] Add `PromoteIntOp_ANY_EXTEND_VECTOR_INREG` (PR #178144)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 28 07:03:45 PST 2026
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/178144
>From c550af184ef4fc47962ee13b2677d349ddcd9f93 Mon Sep 17 00:00:00 2001
From: Abhishek Kaushik <abhishek.kaushik at intel.com>
Date: Tue, 27 Jan 2026 01:26:49 -0800
Subject: [PATCH 1/2] Add `PromoteIntOp_ANY_EXTEND_VECTOR_INREG`
---
.../SelectionDAG/LegalizeIntegerTypes.cpp | 14 +
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 +
llvm/test/CodeGen/AArch64/pr161013.ll | 50 +
llvm/test/CodeGen/X86/pr161013.ll | 1124 +++++++++++++++++
4 files changed, 1189 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/pr161013.ll
create mode 100644 llvm/test/CodeGen/X86/pr161013.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 8ce41df6be69b..58e3e74d1a28d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2044,6 +2044,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
report_fatal_error("Do not know how to promote this operator's operand!");
case ISD::ANY_EXTEND: Res = PromoteIntOp_ANY_EXTEND(N); break;
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ Res = PromoteIntOp_ANY_EXTEND_VECTOR_INREG(N);
+ break;
case ISD::ATOMIC_STORE:
Res = PromoteIntOp_ATOMIC_STORE(cast<AtomicSDNode>(N));
break;
@@ -2284,6 +2287,17 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND(SDNode *N) {
return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), Op);
}
+SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND_VECTOR_INREG(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ EVT ResVT = N->getValueType(0);
+ EVT OpVT = Op.getValueType();
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), OpVT.getScalarType(),
+ ResVT.getVectorNumElements());
+ Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Op), NewVT, Op,
+ DAG.getVectorIdxConstant(0, SDLoc(Op)));
+ return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), ResVT, Op);
+}
+
SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) {
SDValue Op1 = GetPromotedInteger(N->getOperand(1));
return DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index a39e419e5ad1c..681ceb22c0ad3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -389,6 +389,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
// Integer Operand Promotion.
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_ANY_EXTEND(SDNode *N);
+ SDValue PromoteIntOp_ANY_EXTEND_VECTOR_INREG(SDNode *N);
SDValue PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N);
SDValue PromoteIntOp_BITCAST(SDNode *N);
SDValue PromoteIntOp_BUILD_PAIR(SDNode *N);
diff --git a/llvm/test/CodeGen/AArch64/pr161013.ll b/llvm/test/CodeGen/AArch64/pr161013.ll
new file mode 100644
index 0000000000000..5dd4df61aceb2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/pr161013.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-- < %s | FileCheck %s
+
+define <16 x i4> @avir_v2i4_v16i4(<2 x i4> %arg) {
+; CHECK-LABEL: avir_v2i4_v16i4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
+; CHECK-NEXT: str d0, [sp, #8]
+; CHECK-NEXT: ldr x8, [sp, #8]
+; CHECK-NEXT: and w10, w8, #0xf
+; CHECK-NEXT: ubfx w9, w8, #4, #4
+; CHECK-NEXT: fmov s0, w10
+; CHECK-NEXT: mov v0.b[1], w9
+; CHECK-NEXT: ubfx w9, w8, #8, #4
+; CHECK-NEXT: mov v0.b[2], w9
+; CHECK-NEXT: ubfx w9, w8, #12, #4
+; CHECK-NEXT: mov v0.b[3], w9
+; CHECK-NEXT: ubfx w9, w8, #16, #4
+; CHECK-NEXT: mov v0.b[4], w9
+; CHECK-NEXT: ubfx w9, w8, #20, #4
+; CHECK-NEXT: mov v0.b[5], w9
+; CHECK-NEXT: ubfx w9, w8, #24, #4
+; CHECK-NEXT: mov v0.b[6], w9
+; CHECK-NEXT: lsr w9, w8, #28
+; CHECK-NEXT: mov v0.b[7], w9
+; CHECK-NEXT: ubfx x9, x8, #32, #4
+; CHECK-NEXT: mov v0.b[8], w9
+; CHECK-NEXT: ubfx x9, x8, #36, #4
+; CHECK-NEXT: mov v0.b[9], w9
+; CHECK-NEXT: ubfx x9, x8, #40, #4
+; CHECK-NEXT: mov v0.b[10], w9
+; CHECK-NEXT: ubfx x9, x8, #44, #4
+; CHECK-NEXT: mov v0.b[11], w9
+; CHECK-NEXT: ubfx x9, x8, #48, #4
+; CHECK-NEXT: mov v0.b[12], w9
+; CHECK-NEXT: ubfx x9, x8, #52, #4
+; CHECK-NEXT: mov v0.b[13], w9
+; CHECK-NEXT: ubfx x9, x8, #56, #4
+; CHECK-NEXT: lsr x8, x8, #60
+; CHECK-NEXT: mov v0.b[14], w9
+; CHECK-NEXT: mov v0.b[15], w8
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+ %res = shufflevector <2 x i4> %arg, <2 x i4> poison,
+ <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1 , i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ ret <16 x i4> %res
+}
diff --git a/llvm/test/CodeGen/X86/pr161013.ll b/llvm/test/CodeGen/X86/pr161013.ll
new file mode 100644
index 0000000000000..2e805047f5842
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr161013.ll
@@ -0,0 +1,1124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512
+
+
+define <32 x i4> @avir_v4i4_to_v32i4(<4 x i4> %arg) {
+; AVX1-LABEL: avir_v4i4_to_v32i4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX1-NEXT: movl %ecx, %edx
+; AVX1-NEXT: shrl $4, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: movl %ecx, %esi
+; AVX1-NEXT: andl $15, %esi
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
+; AVX1-NEXT: movl %ecx, %edx
+; AVX1-NEXT: shrl $8, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; AVX1-NEXT: movl %ecx, %edx
+; AVX1-NEXT: shrl $12, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; AVX1-NEXT: movl %ecx, %edx
+; AVX1-NEXT: shrl $16, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0
+; AVX1-NEXT: movl %ecx, %edx
+; AVX1-NEXT: shrl $20, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0
+; AVX1-NEXT: movl %ecx, %edx
+; AVX1-NEXT: shrl $24, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpinsrb $6, %edx, %xmm0, %xmm0
+; AVX1-NEXT: movl %ecx, %edx
+; AVX1-NEXT: shrl $28, %edx
+; AVX1-NEXT: vpinsrb $7, %edx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rcx, %rdx
+; AVX1-NEXT: shrq $32, %rdx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rcx, %rdx
+; AVX1-NEXT: shrq $36, %rdx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpinsrb $9, %edx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rcx, %rdx
+; AVX1-NEXT: shrq $40, %rdx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpinsrb $10, %edx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rcx, %rdx
+; AVX1-NEXT: shrq $44, %rdx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpinsrb $11, %edx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rcx, %rdx
+; AVX1-NEXT: shrq $48, %rdx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rcx, %rdx
+; AVX1-NEXT: shrq $52, %rdx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rcx, %rdx
+; AVX1-NEXT: shrq $56, %rdx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpinsrb $14, %edx, %xmm0, %xmm0
+; AVX1-NEXT: shrq $60, %rcx
+; AVX1-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $4, %ecx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vmovd %edx, %xmm1
+; AVX1-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $8, %ecx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $12, %ecx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $16, %ecx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $20, %ecx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $24, %ecx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $28, %ecx
+; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $36, %rcx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $40, %rcx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $44, %rcx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $48, %rcx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $52, %rcx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $56, %rcx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shrq $60, %rax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avir_v4i4_to_v32i4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movl %ecx, %edx
+; AVX2-NEXT: shrl $4, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: movl %ecx, %esi
+; AVX2-NEXT: andl $15, %esi
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
+; AVX2-NEXT: movl %ecx, %edx
+; AVX2-NEXT: shrl $8, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; AVX2-NEXT: movl %ecx, %edx
+; AVX2-NEXT: shrl $12, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; AVX2-NEXT: movl %ecx, %edx
+; AVX2-NEXT: shrl $16, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0
+; AVX2-NEXT: movl %ecx, %edx
+; AVX2-NEXT: shrl $20, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0
+; AVX2-NEXT: movl %ecx, %edx
+; AVX2-NEXT: shrl $24, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpinsrb $6, %edx, %xmm0, %xmm0
+; AVX2-NEXT: movl %ecx, %edx
+; AVX2-NEXT: shrl $28, %edx
+; AVX2-NEXT: vpinsrb $7, %edx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rcx, %rdx
+; AVX2-NEXT: shrq $32, %rdx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rcx, %rdx
+; AVX2-NEXT: shrq $36, %rdx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpinsrb $9, %edx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rcx, %rdx
+; AVX2-NEXT: shrq $40, %rdx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpinsrb $10, %edx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rcx, %rdx
+; AVX2-NEXT: shrq $44, %rdx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpinsrb $11, %edx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rcx, %rdx
+; AVX2-NEXT: shrq $48, %rdx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rcx, %rdx
+; AVX2-NEXT: shrq $52, %rdx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rcx, %rdx
+; AVX2-NEXT: shrq $56, %rdx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpinsrb $14, %edx, %xmm0, %xmm0
+; AVX2-NEXT: shrq $60, %rcx
+; AVX2-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $4, %ecx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vmovd %edx, %xmm1
+; AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $8, %ecx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $12, %ecx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $16, %ecx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $20, %ecx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $24, %ecx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $28, %ecx
+; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $32, %rcx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $36, %rcx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $40, %rcx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $44, %rcx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $48, %rcx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $52, %rcx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $56, %rcx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shrq $60, %rax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: avir_v4i4_to_v32i4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movl %ecx, %edx
+; AVX512-NEXT: shrl $4, %edx
+; AVX512-NEXT: andl $15, %edx
+; AVX512-NEXT: movl %ecx, %esi
+; AVX512-NEXT: andl $15, %esi
+; AVX512-NEXT: vmovd %esi, %xmm0
+; AVX512-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
+; AVX512-NEXT: movl %ecx, %edx
+; AVX512-NEXT: shrl $8, %edx
+; AVX512-NEXT: andl $15, %edx
+; AVX512-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; AVX512-NEXT: movl %ecx, %edx
+; AVX512-NEXT: shrl $12, %edx
+; AVX512-NEXT: andl $15, %edx
+; AVX512-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; AVX512-NEXT: movl %ecx, %edx
+; AVX512-NEXT: shrl $16, %edx
+; AVX512-NEXT: andl $15, %edx
+; AVX512-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0
+; AVX512-NEXT: movl %ecx, %edx
+; AVX512-NEXT: shrl $20, %edx
+; AVX512-NEXT: andl $15, %edx
+; AVX512-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0
+; AVX512-NEXT: movl %ecx, %edx
+; AVX512-NEXT: shrl $24, %edx
+; AVX512-NEXT: andl $15, %edx
+; AVX512-NEXT: vpinsrb $6, %edx, %xmm0, %xmm0
+; AVX512-NEXT: movl %ecx, %edx
+; AVX512-NEXT: shrl $28, %edx
+; AVX512-NEXT: vpinsrb $7, %edx, %xmm0, %xmm0
+; AVX512-NEXT: movq %rcx, %rdx
+; AVX512-NEXT: shrq $32, %rdx
+; AVX512-NEXT: andl $15, %edx
+; AVX512-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0
+; AVX512-NEXT: movq %rcx, %rdx
+; AVX512-NEXT: shrq $36, %rdx
+; AVX512-NEXT: andl $15, %edx
+; AVX512-NEXT: vpinsrb $9, %edx, %xmm0, %xmm0
+; AVX512-NEXT: movq %rcx, %rdx
+; AVX512-NEXT: shrq $40, %rdx
+; AVX512-NEXT: andl $15, %edx
+; AVX512-NEXT: vpinsrb $10, %edx, %xmm0, %xmm0
+; AVX512-NEXT: movq %rcx, %rdx
+; AVX512-NEXT: shrq $44, %rdx
+; AVX512-NEXT: andl $15, %edx
+; AVX512-NEXT: vpinsrb $11, %edx, %xmm0, %xmm0
+; AVX512-NEXT: movq %rcx, %rdx
+; AVX512-NEXT: shrq $48, %rdx
+; AVX512-NEXT: andl $15, %edx
+; AVX512-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; AVX512-NEXT: movq %rcx, %rdx
+; AVX512-NEXT: shrq $52, %rdx
+; AVX512-NEXT: andl $15, %edx
+; AVX512-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
+; AVX512-NEXT: movq %rcx, %rdx
+; AVX512-NEXT: shrq $56, %rdx
+; AVX512-NEXT: andl $15, %edx
+; AVX512-NEXT: vpinsrb $14, %edx, %xmm0, %xmm0
+; AVX512-NEXT: shrq $60, %rcx
+; AVX512-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
+; AVX512-NEXT: movl %eax, %ecx
+; AVX512-NEXT: shrl $4, %ecx
+; AVX512-NEXT: andl $15, %ecx
+; AVX512-NEXT: movl %eax, %edx
+; AVX512-NEXT: andl $15, %edx
+; AVX512-NEXT: vmovd %edx, %xmm1
+; AVX512-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
+; AVX512-NEXT: movl %eax, %ecx
+; AVX512-NEXT: shrl $8, %ecx
+; AVX512-NEXT: andl $15, %ecx
+; AVX512-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX512-NEXT: movl %eax, %ecx
+; AVX512-NEXT: shrl $12, %ecx
+; AVX512-NEXT: andl $15, %ecx
+; AVX512-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
+; AVX512-NEXT: movl %eax, %ecx
+; AVX512-NEXT: shrl $16, %ecx
+; AVX512-NEXT: andl $15, %ecx
+; AVX512-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
+; AVX512-NEXT: movl %eax, %ecx
+; AVX512-NEXT: shrl $20, %ecx
+; AVX512-NEXT: andl $15, %ecx
+; AVX512-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
+; AVX512-NEXT: movl %eax, %ecx
+; AVX512-NEXT: shrl $24, %ecx
+; AVX512-NEXT: andl $15, %ecx
+; AVX512-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
+; AVX512-NEXT: movl %eax, %ecx
+; AVX512-NEXT: shrl $28, %ecx
+; AVX512-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $32, %rcx
+; AVX512-NEXT: andl $15, %ecx
+; AVX512-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $36, %rcx
+; AVX512-NEXT: andl $15, %ecx
+; AVX512-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $40, %rcx
+; AVX512-NEXT: andl $15, %ecx
+; AVX512-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $44, %rcx
+; AVX512-NEXT: andl $15, %ecx
+; AVX512-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $48, %rcx
+; AVX512-NEXT: andl $15, %ecx
+; AVX512-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $52, %rcx
+; AVX512-NEXT: andl $15, %ecx
+; AVX512-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $56, %rcx
+; AVX512-NEXT: andl $15, %ecx
+; AVX512-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
+; AVX512-NEXT: shrq $60, %rax
+; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %res = shufflevector <4 x i4> %arg, <4 x i4> poison,
+ <32 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1 , i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ ret <32 x i4> %res
+}
+
+define <64 x i4> @avir_v4i4_to_v64i4(<4 x i4> %arg) {
+; AVX-LABEL: avir_v4i4_to_v64i4:
+; AVX: # %bb.0:
+; AVX-NEXT: movq %rdi, %rax
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX-NEXT: retq
+ %res = shufflevector <4 x i4> %arg, <4 x i4> poison,
+ <64 x i32> <i32 0 , i32 poison, i32 1 , i32 poison, i32 2 , i32 poison, i32 3 , i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ ret <64 x i4> %res
+}
+
+define <64 x i4> @avir_v8i4_to_v64i4(<8 x i4> %arg) {
+; AVX-LABEL: avir_v8i4_to_v64i4:
+; AVX: # %bb.0:
+; AVX-NEXT: movq %rdi, %rax
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX-NEXT: retq
+ %res = shufflevector <8 x i4> %arg, <8 x i4> poison,
+ <64 x i32> <i32 0 , i32 poison, i32 1 , i32 poison, i32 2 , i32 poison, i32 3 , i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ ret <64 x i4> %res
+}
+
+define <64 x i4> @avir_v16i4_to_v64i4(<16 x i4> %arg) {
+; AVX-LABEL: avir_v16i4_to_v64i4:
+; AVX: # %bb.0:
+; AVX-NEXT: movq %rdi, %rax
+; AVX-NEXT: vmovaps %xmm0, (%rdi)
+; AVX-NEXT: retq
+ %res = shufflevector <16 x i4> %arg, <16 x i4> poison,
+ <64 x i32> <i32 0 , i32 poison, i32 1 , i32 poison, i32 2 , i32 poison, i32 3 , i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ ret <64 x i4> %res
+}
+
+define <128 x i4> @avir_v4i4_to_v128i4(<4 x i4> %arg) {
+; AVX1-LABEL: avir_v4i4_to_v128i4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: vpextrb $8, %xmm0, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: shlq $32, %rcx
+; AVX1-NEXT: vmovd %xmm0, %esi
+; AVX1-NEXT: andl $15, %esi
+; AVX1-NEXT: vpextrb $1, %xmm0, %edi
+; AVX1-NEXT: andl $15, %edi
+; AVX1-NEXT: shll $4, %edi
+; AVX1-NEXT: orl %esi, %edi
+; AVX1-NEXT: vpextrb $2, %xmm0, %esi
+; AVX1-NEXT: andl $15, %esi
+; AVX1-NEXT: shll $8, %esi
+; AVX1-NEXT: orl %edi, %esi
+; AVX1-NEXT: vpextrb $3, %xmm0, %edi
+; AVX1-NEXT: andl $15, %edi
+; AVX1-NEXT: shll $12, %edi
+; AVX1-NEXT: orl %esi, %edi
+; AVX1-NEXT: shll $16, %edx
+; AVX1-NEXT: orl %edi, %edx
+; AVX1-NEXT: vpextrb $9, %xmm0, %edi
+; AVX1-NEXT: andl $15, %edi
+; AVX1-NEXT: movl %edi, %r8d
+; AVX1-NEXT: shll $20, %r8d
+; AVX1-NEXT: orl %edx, %r8d
+; AVX1-NEXT: vpextrb $12, %xmm0, %esi
+; AVX1-NEXT: andl $15, %esi
+; AVX1-NEXT: movl %esi, %r9d
+; AVX1-NEXT: shll $24, %r9d
+; AVX1-NEXT: vpextrb $13, %xmm0, %edx
+; AVX1-NEXT: movl %edx, %r10d
+; AVX1-NEXT: shll $28, %r10d
+; AVX1-NEXT: orl %r9d, %r10d
+; AVX1-NEXT: orl %r8d, %r10d
+; AVX1-NEXT: orq %rcx, %r10
+; AVX1-NEXT: shlq $36, %rdi
+; AVX1-NEXT: orq %r10, %rdi
+; AVX1-NEXT: movq %rsi, %rcx
+; AVX1-NEXT: shlq $40, %rcx
+; AVX1-NEXT: orq %rdi, %rcx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: shlq $44, %rdi
+; AVX1-NEXT: orq %rcx, %rdi
+; AVX1-NEXT: shlq $48, %rsi
+; AVX1-NEXT: shlq $52, %rdx
+; AVX1-NEXT: orq %rsi, %rdx
+; AVX1-NEXT: vpextrb $14, %xmm0, %ecx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: shlq $56, %rcx
+; AVX1-NEXT: orq %rdx, %rcx
+; AVX1-NEXT: vpextrb $15, %xmm0, %edx
+; AVX1-NEXT: shlq $60, %rdx
+; AVX1-NEXT: orq %rcx, %rdx
+; AVX1-NEXT: orq %rdi, %rdx
+; AVX1-NEXT: movq %rdx, (%rax)
+; AVX1-NEXT: movq $0, 8(%rax)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avir_v4i4_to_v128i4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,u,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrb $8, %xmm1, %ecx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: shlq $32, %rcx
+; AVX2-NEXT: vmovd %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpextrb $1, %xmm1, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $4, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $2, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $8, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $3, %xmm1, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $12, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $4, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $16, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $5, %xmm1, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $20, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $6, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $24, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $7, %xmm1, %esi
+; AVX2-NEXT: shll $28, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: orq %rcx, %rsi
+; AVX2-NEXT: vpextrb $9, %xmm1, %ecx
+; AVX2-NEXT: movzwl %cx, %ecx
+; AVX2-NEXT: shlq $36, %rcx
+; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: vpextrb $10, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $40, %rdx
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: vpextrb $11, %xmm1, %ecx
+; AVX2-NEXT: movzwl %cx, %ecx
+; AVX2-NEXT: shlq $44, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: vpextrb $12, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $48, %rdx
+; AVX2-NEXT: vpextrb $13, %xmm1, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shlq $52, %rsi
+; AVX2-NEXT: orq %rdx, %rsi
+; AVX2-NEXT: vpextrb $14, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $56, %rdx
+; AVX2-NEXT: orq %rsi, %rdx
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: vpextrb $15, %xmm1, %ecx
+; AVX2-NEXT: shlq $60, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: movq %rcx, 8(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm0, %ecx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: shlq $32, %rcx
+; AVX2-NEXT: vmovd %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpextrb $1, %xmm0, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $4, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $2, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $8, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $3, %xmm0, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $12, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $4, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $16, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $5, %xmm0, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $20, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $6, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $24, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $7, %xmm0, %esi
+; AVX2-NEXT: shll $28, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: orq %rcx, %rsi
+; AVX2-NEXT: vpextrb $9, %xmm0, %ecx
+; AVX2-NEXT: movzwl %cx, %ecx
+; AVX2-NEXT: shlq $36, %rcx
+; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: vpextrb $10, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $40, %rdx
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: vpextrb $11, %xmm0, %ecx
+; AVX2-NEXT: movzwl %cx, %ecx
+; AVX2-NEXT: shlq $44, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: vpextrb $12, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $48, %rdx
+; AVX2-NEXT: vpextrb $13, %xmm0, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shlq $52, %rsi
+; AVX2-NEXT: orq %rdx, %rsi
+; AVX2-NEXT: vpextrb $14, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $56, %rdx
+; AVX2-NEXT: orq %rsi, %rdx
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: vpextrb $15, %xmm0, %ecx
+; AVX2-NEXT: shlq $60, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: movq %rcx, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: avir_v4i4_to_v128i4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movq %rdi, %rax
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX512-NEXT: retq
+ %res = shufflevector <4 x i4> %arg, <4 x i4> poison,
+ <128 x i32> <i32 0 , i32 poison, i32 poison, i32 poison, i32 2 , i32 poison, i32 3 , i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ ret <128 x i4> %res
+}
+
+define <128 x i4> @avir_v8i4_to_v128i4(<8 x i4> %arg) {
+; AVX1-LABEL: avir_v8i4_to_v128i4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: vpextrb $8, %xmm0, %ecx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: shlq $32, %rcx
+; AVX1-NEXT: vmovd %xmm0, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpextrb $1, %xmm0, %esi
+; AVX1-NEXT: andl $15, %esi
+; AVX1-NEXT: shll $4, %esi
+; AVX1-NEXT: orl %edx, %esi
+; AVX1-NEXT: vpextrb $2, %xmm0, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: shll $8, %edx
+; AVX1-NEXT: orl %esi, %edx
+; AVX1-NEXT: vpextrb $3, %xmm0, %esi
+; AVX1-NEXT: andl $15, %esi
+; AVX1-NEXT: shll $12, %esi
+; AVX1-NEXT: orl %edx, %esi
+; AVX1-NEXT: vpextrb $4, %xmm0, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: shll $16, %edx
+; AVX1-NEXT: orl %esi, %edx
+; AVX1-NEXT: vpextrb $5, %xmm0, %esi
+; AVX1-NEXT: andl $15, %esi
+; AVX1-NEXT: shll $20, %esi
+; AVX1-NEXT: orl %edx, %esi
+; AVX1-NEXT: vpextrb $6, %xmm0, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: shll $24, %edx
+; AVX1-NEXT: vpextrb $7, %xmm0, %edi
+; AVX1-NEXT: shll $28, %edi
+; AVX1-NEXT: orl %edx, %edi
+; AVX1-NEXT: orl %esi, %edi
+; AVX1-NEXT: orq %rcx, %rdi
+; AVX1-NEXT: vpextrb $9, %xmm0, %ecx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: shlq $36, %rcx
+; AVX1-NEXT: orq %rdi, %rcx
+; AVX1-NEXT: vpextrb $10, %xmm0, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: shlq $40, %rdx
+; AVX1-NEXT: orq %rcx, %rdx
+; AVX1-NEXT: vpextrb $11, %xmm0, %ecx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: shlq $44, %rcx
+; AVX1-NEXT: orq %rdx, %rcx
+; AVX1-NEXT: vpextrb $12, %xmm0, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: shlq $48, %rdx
+; AVX1-NEXT: vpextrb $13, %xmm0, %esi
+; AVX1-NEXT: andl $15, %esi
+; AVX1-NEXT: shlq $52, %rsi
+; AVX1-NEXT: orq %rdx, %rsi
+; AVX1-NEXT: vpextrb $14, %xmm0, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: shlq $56, %rdx
+; AVX1-NEXT: orq %rsi, %rdx
+; AVX1-NEXT: vpextrb $15, %xmm0, %esi
+; AVX1-NEXT: shlq $60, %rsi
+; AVX1-NEXT: orq %rdx, %rsi
+; AVX1-NEXT: orq %rcx, %rsi
+; AVX1-NEXT: movq %rsi, (%rax)
+; AVX1-NEXT: movq $0, 8(%rax)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avir_v8i4_to_v128i4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,u,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrb $8, %xmm1, %ecx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: shlq $32, %rcx
+; AVX2-NEXT: vmovd %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpextrb $1, %xmm1, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $4, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $2, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $8, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $3, %xmm1, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $12, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $4, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $16, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $5, %xmm1, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $20, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $6, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $24, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $7, %xmm1, %esi
+; AVX2-NEXT: shll $28, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: orq %rcx, %rsi
+; AVX2-NEXT: vpextrb $9, %xmm1, %ecx
+; AVX2-NEXT: movzwl %cx, %ecx
+; AVX2-NEXT: shlq $36, %rcx
+; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: vpextrb $10, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $40, %rdx
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: vpextrb $11, %xmm1, %ecx
+; AVX2-NEXT: movzwl %cx, %ecx
+; AVX2-NEXT: shlq $44, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: vpextrb $12, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $48, %rdx
+; AVX2-NEXT: vpextrb $13, %xmm1, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shlq $52, %rsi
+; AVX2-NEXT: orq %rdx, %rsi
+; AVX2-NEXT: vpextrb $14, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $56, %rdx
+; AVX2-NEXT: orq %rsi, %rdx
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: vpextrb $15, %xmm1, %ecx
+; AVX2-NEXT: shlq $60, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: movq %rcx, 8(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm0, %ecx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: shlq $32, %rcx
+; AVX2-NEXT: vmovd %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpextrb $1, %xmm0, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $4, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $2, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $8, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $3, %xmm0, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $12, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $4, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $16, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $5, %xmm0, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $20, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $6, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $24, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $7, %xmm0, %esi
+; AVX2-NEXT: shll $28, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: orq %rcx, %rsi
+; AVX2-NEXT: vpextrb $9, %xmm0, %ecx
+; AVX2-NEXT: movzwl %cx, %ecx
+; AVX2-NEXT: shlq $36, %rcx
+; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: vpextrb $10, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $40, %rdx
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: vpextrb $11, %xmm0, %ecx
+; AVX2-NEXT: movzwl %cx, %ecx
+; AVX2-NEXT: shlq $44, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: vpextrb $12, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $48, %rdx
+; AVX2-NEXT: vpextrb $13, %xmm0, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shlq $52, %rsi
+; AVX2-NEXT: orq %rdx, %rsi
+; AVX2-NEXT: vpextrb $14, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $56, %rdx
+; AVX2-NEXT: orq %rsi, %rdx
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: vpextrb $15, %xmm0, %ecx
+; AVX2-NEXT: shlq $60, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: movq %rcx, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: avir_v8i4_to_v128i4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movq %rdi, %rax
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX512-NEXT: retq
+ %res = shufflevector <8 x i4> %arg, <8 x i4> poison,
+ <128 x i32> <i32 0 , i32 poison, i32 poison, i32 poison, i32 2 , i32 poison, i32 3 , i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ ret <128 x i4> %res
+}
+
+define <128 x i4> @avir_v16i4_to_v128i4(<16 x i4> %arg) {
+; AVX1-LABEL: avir_v16i4_to_v128i4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: vpextrb $12, %xmm0, %ecx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: shlq $32, %rcx
+; AVX1-NEXT: vpextrd $2, %xmm0, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpextrb $9, %xmm0, %esi
+; AVX1-NEXT: andl $15, %esi
+; AVX1-NEXT: shll $8, %esi
+; AVX1-NEXT: orl %edx, %esi
+; AVX1-NEXT: vpextrb $10, %xmm0, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: shll $16, %edx
+; AVX1-NEXT: orl %esi, %edx
+; AVX1-NEXT: vpextrb $11, %xmm0, %esi
+; AVX1-NEXT: andl $15, %esi
+; AVX1-NEXT: shll $24, %esi
+; AVX1-NEXT: orl %edx, %esi
+; AVX1-NEXT: orq %rcx, %rsi
+; AVX1-NEXT: vpextrb $13, %xmm0, %ecx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: shlq $40, %rcx
+; AVX1-NEXT: orq %rsi, %rcx
+; AVX1-NEXT: vpextrb $14, %xmm0, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: shlq $48, %rdx
+; AVX1-NEXT: orq %rcx, %rdx
+; AVX1-NEXT: vpextrb $15, %xmm0, %ecx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: shlq $56, %rcx
+; AVX1-NEXT: orq %rdx, %rcx
+; AVX1-NEXT: movq %rcx, 8(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm0, %ecx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: shlq $32, %rcx
+; AVX1-NEXT: vmovd %xmm0, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpextrb $1, %xmm0, %esi
+; AVX1-NEXT: andl $15, %esi
+; AVX1-NEXT: shll $8, %esi
+; AVX1-NEXT: orl %edx, %esi
+; AVX1-NEXT: vpextrb $2, %xmm0, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: shll $16, %edx
+; AVX1-NEXT: orl %esi, %edx
+; AVX1-NEXT: vpextrb $3, %xmm0, %esi
+; AVX1-NEXT: andl $15, %esi
+; AVX1-NEXT: shll $24, %esi
+; AVX1-NEXT: orl %edx, %esi
+; AVX1-NEXT: orq %rcx, %rsi
+; AVX1-NEXT: vpextrb $5, %xmm0, %ecx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: shlq $40, %rcx
+; AVX1-NEXT: orq %rsi, %rcx
+; AVX1-NEXT: vpextrb $6, %xmm0, %edx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: shlq $48, %rdx
+; AVX1-NEXT: orq %rcx, %rdx
+; AVX1-NEXT: vpextrb $7, %xmm0, %ecx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: shlq $56, %rcx
+; AVX1-NEXT: orq %rdx, %rcx
+; AVX1-NEXT: movq %rcx, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avir_v16i4_to_v128i4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrb $8, %xmm1, %ecx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: shlq $32, %rcx
+; AVX2-NEXT: vmovd %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpextrb $1, %xmm1, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $4, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $2, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $8, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $3, %xmm1, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $12, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $4, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $16, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $5, %xmm1, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $20, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $6, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $24, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $7, %xmm1, %esi
+; AVX2-NEXT: shll $28, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: orq %rcx, %rsi
+; AVX2-NEXT: vpextrb $9, %xmm1, %ecx
+; AVX2-NEXT: movzwl %cx, %ecx
+; AVX2-NEXT: shlq $36, %rcx
+; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: vpextrb $10, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $40, %rdx
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: vpextrb $11, %xmm1, %ecx
+; AVX2-NEXT: movzwl %cx, %ecx
+; AVX2-NEXT: shlq $44, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: vpextrb $12, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $48, %rdx
+; AVX2-NEXT: vpextrb $13, %xmm1, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shlq $52, %rsi
+; AVX2-NEXT: orq %rdx, %rsi
+; AVX2-NEXT: vpextrb $14, %xmm1, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $56, %rdx
+; AVX2-NEXT: orq %rsi, %rdx
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: vpextrb $15, %xmm1, %ecx
+; AVX2-NEXT: shlq $60, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: movq %rcx, 8(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm0, %ecx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: shlq $32, %rcx
+; AVX2-NEXT: vmovd %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpextrb $1, %xmm0, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $4, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $2, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $8, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $3, %xmm0, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $12, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $4, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $16, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $5, %xmm0, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shll $20, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: vpextrb $6, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shll $24, %edx
+; AVX2-NEXT: orl %esi, %edx
+; AVX2-NEXT: vpextrb $7, %xmm0, %esi
+; AVX2-NEXT: shll $28, %esi
+; AVX2-NEXT: orl %edx, %esi
+; AVX2-NEXT: orq %rcx, %rsi
+; AVX2-NEXT: vpextrb $9, %xmm0, %ecx
+; AVX2-NEXT: movzwl %cx, %ecx
+; AVX2-NEXT: shlq $36, %rcx
+; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: vpextrb $10, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $40, %rdx
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: vpextrb $11, %xmm0, %ecx
+; AVX2-NEXT: movzwl %cx, %ecx
+; AVX2-NEXT: shlq $44, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: vpextrb $12, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $48, %rdx
+; AVX2-NEXT: vpextrb $13, %xmm0, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: shlq $52, %rsi
+; AVX2-NEXT: orq %rdx, %rsi
+; AVX2-NEXT: vpextrb $14, %xmm0, %edx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $56, %rdx
+; AVX2-NEXT: orq %rsi, %rdx
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: vpextrb $15, %xmm0, %ecx
+; AVX2-NEXT: shlq $60, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: movq %rcx, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: avir_v16i4_to_v128i4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movq %rdi, %rax
+; AVX512-NEXT: vmovaps %xmm0, (%rdi)
+; AVX512-NEXT: retq
+ %res = shufflevector <16 x i4> %arg, <16 x i4> poison,
+ <128 x i32> <i32 0 , i32 poison, i32 poison, i32 poison, i32 2 , i32 poison, i32 3 , i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ ret <128 x i4> %res
+}
>From 50f76f0fd20c5a3009cc5ccfe3324633e00201a9 Mon Sep 17 00:00:00 2001
From: Abhishek Kaushik <abhishek.kaushik at intel.com>
Date: Wed, 28 Jan 2026 13:49:07 +0530
Subject: [PATCH 2/2] Use `nounwind` and `getExtractSubVector`
---
llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 3 +--
llvm/test/CodeGen/AArch64/pr161013.ll | 3 +--
2 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 58e3e74d1a28d..5b32c5f945a75 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2293,8 +2293,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND_VECTOR_INREG(SDNode *N) {
EVT OpVT = Op.getValueType();
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), OpVT.getScalarType(),
ResVT.getVectorNumElements());
- Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Op), NewVT, Op,
- DAG.getVectorIdxConstant(0, SDLoc(Op)));
+ Op = DAG.getExtractSubvector(SDLoc(Op), NewVT, Op, 0);
return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), ResVT, Op);
}
diff --git a/llvm/test/CodeGen/AArch64/pr161013.ll b/llvm/test/CodeGen/AArch64/pr161013.ll
index 5dd4df61aceb2..d163914f1ac0e 100644
--- a/llvm/test/CodeGen/AArch64/pr161013.ll
+++ b/llvm/test/CodeGen/AArch64/pr161013.ll
@@ -1,11 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=aarch64-- < %s | FileCheck %s
-define <16 x i4> @avir_v2i4_v16i4(<2 x i4> %arg) {
+define <16 x i4> @avir_v2i4_v16i4(<2 x i4> %arg) nounwind {
; CHECK-LABEL: avir_v2i4_v16i4:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-NEXT: str d0, [sp, #8]
; CHECK-NEXT: ldr x8, [sp, #8]
More information about the llvm-commits
mailing list