[llvm] [LegalizeIntegerTypes] Add `PromoteIntOp_ANY_EXTEND_VECTOR_INREG` (PR #178144)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 28 07:03:45 PST 2026


https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/178144

>From c550af184ef4fc47962ee13b2677d349ddcd9f93 Mon Sep 17 00:00:00 2001
From: Abhishek Kaushik <abhishek.kaushik at intel.com>
Date: Tue, 27 Jan 2026 01:26:49 -0800
Subject: [PATCH 1/2] Add `PromoteIntOp_ANY_EXTEND_VECTOR_INREG`

---
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |   14 +
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |    1 +
 llvm/test/CodeGen/AArch64/pr161013.ll         |   50 +
 llvm/test/CodeGen/X86/pr161013.ll             | 1124 +++++++++++++++++
 4 files changed, 1189 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/pr161013.ll
 create mode 100644 llvm/test/CodeGen/X86/pr161013.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 8ce41df6be69b..58e3e74d1a28d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2044,6 +2044,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
     report_fatal_error("Do not know how to promote this operator's operand!");
 
   case ISD::ANY_EXTEND:   Res = PromoteIntOp_ANY_EXTEND(N); break;
+  case ISD::ANY_EXTEND_VECTOR_INREG:
+    Res = PromoteIntOp_ANY_EXTEND_VECTOR_INREG(N);
+    break;
   case ISD::ATOMIC_STORE:
     Res = PromoteIntOp_ATOMIC_STORE(cast<AtomicSDNode>(N));
     break;
@@ -2284,6 +2287,17 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND(SDNode *N) {
   return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), Op);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND_VECTOR_INREG(SDNode *N) {
+  SDValue Op = GetPromotedInteger(N->getOperand(0)); // Promoted source.
+  EVT ResVT = N->getValueType(0); // Result type of N.
+  EVT OpVT = Op.getValueType();   // Type of the promoted source.
+  EVT NewVT = EVT::getVectorVT(*DAG.getContext(), OpVT.getScalarType(),
+                               ResVT.getVectorNumElements()); // ResVT lanes.
+  Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Op), NewVT, Op,
+                   DAG.getVectorIdxConstant(0, SDLoc(Op))); // From index 0.
+  return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), ResVT, Op);
+}
+
 SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) {
   SDValue Op1 = GetPromotedInteger(N->getOperand(1));
   return DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index a39e419e5ad1c..681ceb22c0ad3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -389,6 +389,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   // Integer Operand Promotion.
   bool PromoteIntegerOperand(SDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_ANY_EXTEND(SDNode *N);
+  SDValue PromoteIntOp_ANY_EXTEND_VECTOR_INREG(SDNode *N);
   SDValue PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N);
   SDValue PromoteIntOp_BITCAST(SDNode *N);
   SDValue PromoteIntOp_BUILD_PAIR(SDNode *N);
diff --git a/llvm/test/CodeGen/AArch64/pr161013.ll b/llvm/test/CodeGen/AArch64/pr161013.ll
new file mode 100644
index 0000000000000..5dd4df61aceb2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/pr161013.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-- < %s | FileCheck %s
+
+define <16 x i4> @avir_v2i4_v16i4(<2 x i4> %arg) {
+; CHECK-LABEL: avir_v2i4_v16i4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
+; CHECK-NEXT:    str d0, [sp, #8]
+; CHECK-NEXT:    ldr x8, [sp, #8]
+; CHECK-NEXT:    and w10, w8, #0xf
+; CHECK-NEXT:    ubfx w9, w8, #4, #4
+; CHECK-NEXT:    fmov s0, w10
+; CHECK-NEXT:    mov v0.b[1], w9
+; CHECK-NEXT:    ubfx w9, w8, #8, #4
+; CHECK-NEXT:    mov v0.b[2], w9
+; CHECK-NEXT:    ubfx w9, w8, #12, #4
+; CHECK-NEXT:    mov v0.b[3], w9
+; CHECK-NEXT:    ubfx w9, w8, #16, #4
+; CHECK-NEXT:    mov v0.b[4], w9
+; CHECK-NEXT:    ubfx w9, w8, #20, #4
+; CHECK-NEXT:    mov v0.b[5], w9
+; CHECK-NEXT:    ubfx w9, w8, #24, #4
+; CHECK-NEXT:    mov v0.b[6], w9
+; CHECK-NEXT:    lsr w9, w8, #28
+; CHECK-NEXT:    mov v0.b[7], w9
+; CHECK-NEXT:    ubfx x9, x8, #32, #4
+; CHECK-NEXT:    mov v0.b[8], w9
+; CHECK-NEXT:    ubfx x9, x8, #36, #4
+; CHECK-NEXT:    mov v0.b[9], w9
+; CHECK-NEXT:    ubfx x9, x8, #40, #4
+; CHECK-NEXT:    mov v0.b[10], w9
+; CHECK-NEXT:    ubfx x9, x8, #44, #4
+; CHECK-NEXT:    mov v0.b[11], w9
+; CHECK-NEXT:    ubfx x9, x8, #48, #4
+; CHECK-NEXT:    mov v0.b[12], w9
+; CHECK-NEXT:    ubfx x9, x8, #52, #4
+; CHECK-NEXT:    mov v0.b[13], w9
+; CHECK-NEXT:    ubfx x9, x8, #56, #4
+; CHECK-NEXT:    lsr x8, x8, #60
+; CHECK-NEXT:    mov v0.b[14], w9
+; CHECK-NEXT:    mov v0.b[15], w8
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %res = shufflevector <2 x i4> %arg, <2 x i4> poison,
+  <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1     , i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <16 x i4> %res
+}
diff --git a/llvm/test/CodeGen/X86/pr161013.ll b/llvm/test/CodeGen/X86/pr161013.ll
new file mode 100644
index 0000000000000..2e805047f5842
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr161013.ll
@@ -0,0 +1,1124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx              | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2             | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f          | FileCheck %s --check-prefixes=AVX,AVX512
+
+
+define <32 x i4> @avir_v4i4_to_v32i4(<4 x i4> %arg) {
+; AVX1-LABEL: avir_v4i4_to_v32i4:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX1-NEXT:    movl %ecx, %edx
+; AVX1-NEXT:    shrl $4, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    movl %ecx, %esi
+; AVX1-NEXT:    andl $15, %esi
+; AVX1-NEXT:    vmovd %esi, %xmm0
+; AVX1-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
+; AVX1-NEXT:    movl %ecx, %edx
+; AVX1-NEXT:    shrl $8, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; AVX1-NEXT:    movl %ecx, %edx
+; AVX1-NEXT:    shrl $12, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    vpinsrb $3, %edx, %xmm0, %xmm0
+; AVX1-NEXT:    movl %ecx, %edx
+; AVX1-NEXT:    shrl $16, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    vpinsrb $4, %edx, %xmm0, %xmm0
+; AVX1-NEXT:    movl %ecx, %edx
+; AVX1-NEXT:    shrl $20, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    vpinsrb $5, %edx, %xmm0, %xmm0
+; AVX1-NEXT:    movl %ecx, %edx
+; AVX1-NEXT:    shrl $24, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    vpinsrb $6, %edx, %xmm0, %xmm0
+; AVX1-NEXT:    movl %ecx, %edx
+; AVX1-NEXT:    shrl $28, %edx
+; AVX1-NEXT:    vpinsrb $7, %edx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    shrq $32, %rdx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    vpinsrb $8, %edx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    shrq $36, %rdx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    vpinsrb $9, %edx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    shrq $40, %rdx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    vpinsrb $10, %edx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    shrq $44, %rdx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    vpinsrb $11, %edx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    shrq $48, %rdx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    shrq $52, %rdx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    shrq $56, %rdx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    vpinsrb $14, %edx, %xmm0, %xmm0
+; AVX1-NEXT:    shrq $60, %rcx
+; AVX1-NEXT:    vpinsrb $15, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movl %eax, %ecx
+; AVX1-NEXT:    shrl $4, %ecx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    movl %eax, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    vmovd %edx, %xmm1
+; AVX1-NEXT:    vpinsrb $1, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movl %eax, %ecx
+; AVX1-NEXT:    shrl $8, %ecx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movl %eax, %ecx
+; AVX1-NEXT:    shrl $12, %ecx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    vpinsrb $3, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movl %eax, %ecx
+; AVX1-NEXT:    shrl $16, %ecx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    vpinsrb $4, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movl %eax, %ecx
+; AVX1-NEXT:    shrl $20, %ecx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    vpinsrb $5, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movl %eax, %ecx
+; AVX1-NEXT:    shrl $24, %ecx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    vpinsrb $6, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movl %eax, %ecx
+; AVX1-NEXT:    shrl $28, %ecx
+; AVX1-NEXT:    vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq $32, %rcx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    vpinsrb $8, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq $36, %rcx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    vpinsrb $9, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq $40, %rcx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    vpinsrb $10, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq $44, %rcx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    vpinsrb $11, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq $48, %rcx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    vpinsrb $12, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq $52, %rcx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    vpinsrb $13, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq $56, %rcx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    vpinsrb $14, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    shrq $60, %rax
+; AVX1-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: avir_v4i4_to_v32i4:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT:    movl %ecx, %edx
+; AVX2-NEXT:    shrl $4, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    movl %ecx, %esi
+; AVX2-NEXT:    andl $15, %esi
+; AVX2-NEXT:    vmovd %esi, %xmm0
+; AVX2-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
+; AVX2-NEXT:    movl %ecx, %edx
+; AVX2-NEXT:    shrl $8, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; AVX2-NEXT:    movl %ecx, %edx
+; AVX2-NEXT:    shrl $12, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpinsrb $3, %edx, %xmm0, %xmm0
+; AVX2-NEXT:    movl %ecx, %edx
+; AVX2-NEXT:    shrl $16, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpinsrb $4, %edx, %xmm0, %xmm0
+; AVX2-NEXT:    movl %ecx, %edx
+; AVX2-NEXT:    shrl $20, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpinsrb $5, %edx, %xmm0, %xmm0
+; AVX2-NEXT:    movl %ecx, %edx
+; AVX2-NEXT:    shrl $24, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpinsrb $6, %edx, %xmm0, %xmm0
+; AVX2-NEXT:    movl %ecx, %edx
+; AVX2-NEXT:    shrl $28, %edx
+; AVX2-NEXT:    vpinsrb $7, %edx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    shrq $32, %rdx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpinsrb $8, %edx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    shrq $36, %rdx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpinsrb $9, %edx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    shrq $40, %rdx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpinsrb $10, %edx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    shrq $44, %rdx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpinsrb $11, %edx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    shrq $48, %rdx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    shrq $52, %rdx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    shrq $56, %rdx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpinsrb $14, %edx, %xmm0, %xmm0
+; AVX2-NEXT:    shrq $60, %rcx
+; AVX2-NEXT:    vpinsrb $15, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    shrl $4, %ecx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    movl %eax, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vmovd %edx, %xmm1
+; AVX2-NEXT:    vpinsrb $1, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    shrl $8, %ecx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    shrl $12, %ecx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpinsrb $3, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    shrl $16, %ecx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpinsrb $4, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    shrl $20, %ecx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpinsrb $5, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    shrl $24, %ecx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpinsrb $6, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    shrl $28, %ecx
+; AVX2-NEXT:    vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq $32, %rcx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpinsrb $8, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq $36, %rcx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpinsrb $9, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq $40, %rcx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpinsrb $10, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq $44, %rcx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpinsrb $11, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq $48, %rcx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpinsrb $12, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq $52, %rcx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpinsrb $13, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq $56, %rcx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpinsrb $14, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    shrq $60, %rax
+; AVX2-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: avir_v4i4_to_v32i4:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movl %ecx, %edx
+; AVX512-NEXT:    shrl $4, %edx
+; AVX512-NEXT:    andl $15, %edx
+; AVX512-NEXT:    movl %ecx, %esi
+; AVX512-NEXT:    andl $15, %esi
+; AVX512-NEXT:    vmovd %esi, %xmm0
+; AVX512-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
+; AVX512-NEXT:    movl %ecx, %edx
+; AVX512-NEXT:    shrl $8, %edx
+; AVX512-NEXT:    andl $15, %edx
+; AVX512-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; AVX512-NEXT:    movl %ecx, %edx
+; AVX512-NEXT:    shrl $12, %edx
+; AVX512-NEXT:    andl $15, %edx
+; AVX512-NEXT:    vpinsrb $3, %edx, %xmm0, %xmm0
+; AVX512-NEXT:    movl %ecx, %edx
+; AVX512-NEXT:    shrl $16, %edx
+; AVX512-NEXT:    andl $15, %edx
+; AVX512-NEXT:    vpinsrb $4, %edx, %xmm0, %xmm0
+; AVX512-NEXT:    movl %ecx, %edx
+; AVX512-NEXT:    shrl $20, %edx
+; AVX512-NEXT:    andl $15, %edx
+; AVX512-NEXT:    vpinsrb $5, %edx, %xmm0, %xmm0
+; AVX512-NEXT:    movl %ecx, %edx
+; AVX512-NEXT:    shrl $24, %edx
+; AVX512-NEXT:    andl $15, %edx
+; AVX512-NEXT:    vpinsrb $6, %edx, %xmm0, %xmm0
+; AVX512-NEXT:    movl %ecx, %edx
+; AVX512-NEXT:    shrl $28, %edx
+; AVX512-NEXT:    vpinsrb $7, %edx, %xmm0, %xmm0
+; AVX512-NEXT:    movq %rcx, %rdx
+; AVX512-NEXT:    shrq $32, %rdx
+; AVX512-NEXT:    andl $15, %edx
+; AVX512-NEXT:    vpinsrb $8, %edx, %xmm0, %xmm0
+; AVX512-NEXT:    movq %rcx, %rdx
+; AVX512-NEXT:    shrq $36, %rdx
+; AVX512-NEXT:    andl $15, %edx
+; AVX512-NEXT:    vpinsrb $9, %edx, %xmm0, %xmm0
+; AVX512-NEXT:    movq %rcx, %rdx
+; AVX512-NEXT:    shrq $40, %rdx
+; AVX512-NEXT:    andl $15, %edx
+; AVX512-NEXT:    vpinsrb $10, %edx, %xmm0, %xmm0
+; AVX512-NEXT:    movq %rcx, %rdx
+; AVX512-NEXT:    shrq $44, %rdx
+; AVX512-NEXT:    andl $15, %edx
+; AVX512-NEXT:    vpinsrb $11, %edx, %xmm0, %xmm0
+; AVX512-NEXT:    movq %rcx, %rdx
+; AVX512-NEXT:    shrq $48, %rdx
+; AVX512-NEXT:    andl $15, %edx
+; AVX512-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; AVX512-NEXT:    movq %rcx, %rdx
+; AVX512-NEXT:    shrq $52, %rdx
+; AVX512-NEXT:    andl $15, %edx
+; AVX512-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
+; AVX512-NEXT:    movq %rcx, %rdx
+; AVX512-NEXT:    shrq $56, %rdx
+; AVX512-NEXT:    andl $15, %edx
+; AVX512-NEXT:    vpinsrb $14, %edx, %xmm0, %xmm0
+; AVX512-NEXT:    shrq $60, %rcx
+; AVX512-NEXT:    vpinsrb $15, %ecx, %xmm0, %xmm0
+; AVX512-NEXT:    movl %eax, %ecx
+; AVX512-NEXT:    shrl $4, %ecx
+; AVX512-NEXT:    andl $15, %ecx
+; AVX512-NEXT:    movl %eax, %edx
+; AVX512-NEXT:    andl $15, %edx
+; AVX512-NEXT:    vmovd %edx, %xmm1
+; AVX512-NEXT:    vpinsrb $1, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    movl %eax, %ecx
+; AVX512-NEXT:    shrl $8, %ecx
+; AVX512-NEXT:    andl $15, %ecx
+; AVX512-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    movl %eax, %ecx
+; AVX512-NEXT:    shrl $12, %ecx
+; AVX512-NEXT:    andl $15, %ecx
+; AVX512-NEXT:    vpinsrb $3, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    movl %eax, %ecx
+; AVX512-NEXT:    shrl $16, %ecx
+; AVX512-NEXT:    andl $15, %ecx
+; AVX512-NEXT:    vpinsrb $4, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    movl %eax, %ecx
+; AVX512-NEXT:    shrl $20, %ecx
+; AVX512-NEXT:    andl $15, %ecx
+; AVX512-NEXT:    vpinsrb $5, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    movl %eax, %ecx
+; AVX512-NEXT:    shrl $24, %ecx
+; AVX512-NEXT:    andl $15, %ecx
+; AVX512-NEXT:    vpinsrb $6, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    movl %eax, %ecx
+; AVX512-NEXT:    shrl $28, %ecx
+; AVX512-NEXT:    vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    shrq $32, %rcx
+; AVX512-NEXT:    andl $15, %ecx
+; AVX512-NEXT:    vpinsrb $8, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    shrq $36, %rcx
+; AVX512-NEXT:    andl $15, %ecx
+; AVX512-NEXT:    vpinsrb $9, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    shrq $40, %rcx
+; AVX512-NEXT:    andl $15, %ecx
+; AVX512-NEXT:    vpinsrb $10, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    shrq $44, %rcx
+; AVX512-NEXT:    andl $15, %ecx
+; AVX512-NEXT:    vpinsrb $11, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    shrq $48, %rcx
+; AVX512-NEXT:    andl $15, %ecx
+; AVX512-NEXT:    vpinsrb $12, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    shrq $52, %rcx
+; AVX512-NEXT:    andl $15, %ecx
+; AVX512-NEXT:    vpinsrb $13, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    shrq $56, %rcx
+; AVX512-NEXT:    andl $15, %ecx
+; AVX512-NEXT:    vpinsrb $14, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    shrq $60, %rax
+; AVX512-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+  %res = shufflevector <4 x i4> %arg, <4 x i4> poison,
+  <32 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1     , i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <32 x i4> %res
+}
+
+define <64 x i4> @avir_v4i4_to_v64i4(<4 x i4> %arg) {
+; AVX-LABEL: avir_v4i4_to_v64i4:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX-NEXT:    retq
+  %res = shufflevector <4 x i4> %arg, <4 x i4> poison,
+  <64 x i32> <i32 0     , i32 poison, i32 1     , i32 poison, i32 2     , i32 poison, i32 3     , i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <64 x i4> %res
+}
+
+define <64 x i4> @avir_v8i4_to_v64i4(<8 x i4> %arg) {
+; AVX-LABEL: avir_v8i4_to_v64i4:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX-NEXT:    retq
+  %res = shufflevector <8 x i4> %arg, <8 x i4> poison,
+  <64 x i32> <i32 0     , i32 poison, i32 1     , i32 poison, i32 2     , i32 poison, i32 3     , i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <64 x i4> %res
+}
+
+define <64 x i4> @avir_v16i4_to_v64i4(<16 x i4> %arg) {
+; AVX-LABEL: avir_v16i4_to_v64i4:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    vmovaps %xmm0, (%rdi)
+; AVX-NEXT:    retq
+  %res = shufflevector <16 x i4> %arg, <16 x i4> poison,
+  <64 x i32> <i32 0     , i32 poison, i32 1     , i32 poison, i32 2     , i32 poison, i32 3     , i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <64 x i4> %res
+}
+
+define <128 x i4> @avir_v4i4_to_v128i4(<4 x i4> %arg) {
+; AVX1-LABEL: avir_v4i4_to_v128i4:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movq %rdi, %rax
+; AVX1-NEXT:    vpextrb $8, %xmm0, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    movq %rdx, %rcx
+; AVX1-NEXT:    shlq $32, %rcx
+; AVX1-NEXT:    vmovd %xmm0, %esi
+; AVX1-NEXT:    andl $15, %esi
+; AVX1-NEXT:    vpextrb $1, %xmm0, %edi
+; AVX1-NEXT:    andl $15, %edi
+; AVX1-NEXT:    shll $4, %edi
+; AVX1-NEXT:    orl %esi, %edi
+; AVX1-NEXT:    vpextrb $2, %xmm0, %esi
+; AVX1-NEXT:    andl $15, %esi
+; AVX1-NEXT:    shll $8, %esi
+; AVX1-NEXT:    orl %edi, %esi
+; AVX1-NEXT:    vpextrb $3, %xmm0, %edi
+; AVX1-NEXT:    andl $15, %edi
+; AVX1-NEXT:    shll $12, %edi
+; AVX1-NEXT:    orl %esi, %edi
+; AVX1-NEXT:    shll $16, %edx
+; AVX1-NEXT:    orl %edi, %edx
+; AVX1-NEXT:    vpextrb $9, %xmm0, %edi
+; AVX1-NEXT:    andl $15, %edi
+; AVX1-NEXT:    movl %edi, %r8d
+; AVX1-NEXT:    shll $20, %r8d
+; AVX1-NEXT:    orl %edx, %r8d
+; AVX1-NEXT:    vpextrb $12, %xmm0, %esi
+; AVX1-NEXT:    andl $15, %esi
+; AVX1-NEXT:    movl %esi, %r9d
+; AVX1-NEXT:    shll $24, %r9d
+; AVX1-NEXT:    vpextrb $13, %xmm0, %edx
+; AVX1-NEXT:    movl %edx, %r10d
+; AVX1-NEXT:    shll $28, %r10d
+; AVX1-NEXT:    orl %r9d, %r10d
+; AVX1-NEXT:    orl %r8d, %r10d
+; AVX1-NEXT:    orq %rcx, %r10
+; AVX1-NEXT:    shlq $36, %rdi
+; AVX1-NEXT:    orq %r10, %rdi
+; AVX1-NEXT:    movq %rsi, %rcx
+; AVX1-NEXT:    shlq $40, %rcx
+; AVX1-NEXT:    orq %rdi, %rcx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    movq %rdx, %rdi
+; AVX1-NEXT:    shlq $44, %rdi
+; AVX1-NEXT:    orq %rcx, %rdi
+; AVX1-NEXT:    shlq $48, %rsi
+; AVX1-NEXT:    shlq $52, %rdx
+; AVX1-NEXT:    orq %rsi, %rdx
+; AVX1-NEXT:    vpextrb $14, %xmm0, %ecx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    shlq $56, %rcx
+; AVX1-NEXT:    orq %rdx, %rcx
+; AVX1-NEXT:    vpextrb $15, %xmm0, %edx
+; AVX1-NEXT:    shlq $60, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
+; AVX1-NEXT:    orq %rdi, %rdx
+; AVX1-NEXT:    movq %rdx, (%rax)
+; AVX1-NEXT:    movq $0, 8(%rax)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: avir_v4i4_to_v128i4:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,u,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpextrb $8, %xmm1, %ecx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    shlq $32, %rcx
+; AVX2-NEXT:    vmovd %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpextrb $1, %xmm1, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $4, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $2, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $8, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $3, %xmm1, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $12, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $16, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $5, %xmm1, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $20, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $6, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $24, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm1, %esi
+; AVX2-NEXT:    shll $28, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    orq %rcx, %rsi
+; AVX2-NEXT:    vpextrb $9, %xmm1, %ecx
+; AVX2-NEXT:    movzwl %cx, %ecx
+; AVX2-NEXT:    shlq $36, %rcx
+; AVX2-NEXT:    orq %rsi, %rcx
+; AVX2-NEXT:    vpextrb $10, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $40, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vpextrb $11, %xmm1, %ecx
+; AVX2-NEXT:    movzwl %cx, %ecx
+; AVX2-NEXT:    shlq $44, %rcx
+; AVX2-NEXT:    orq %rdx, %rcx
+; AVX2-NEXT:    vpextrb $12, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $48, %rdx
+; AVX2-NEXT:    vpextrb $13, %xmm1, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shlq $52, %rsi
+; AVX2-NEXT:    orq %rdx, %rsi
+; AVX2-NEXT:    vpextrb $14, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $56, %rdx
+; AVX2-NEXT:    orq %rsi, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vpextrb $15, %xmm1, %ecx
+; AVX2-NEXT:    shlq $60, %rcx
+; AVX2-NEXT:    orq %rdx, %rcx
+; AVX2-NEXT:    movq %rcx, 8(%rdi)
+; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    shlq $32, %rcx
+; AVX2-NEXT:    vmovd %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpextrb $1, %xmm0, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $4, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $2, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $8, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $3, %xmm0, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $12, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $16, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $5, %xmm0, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $20, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $6, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $24, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm0, %esi
+; AVX2-NEXT:    shll $28, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    orq %rcx, %rsi
+; AVX2-NEXT:    vpextrb $9, %xmm0, %ecx
+; AVX2-NEXT:    movzwl %cx, %ecx
+; AVX2-NEXT:    shlq $36, %rcx
+; AVX2-NEXT:    orq %rsi, %rcx
+; AVX2-NEXT:    vpextrb $10, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $40, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vpextrb $11, %xmm0, %ecx
+; AVX2-NEXT:    movzwl %cx, %ecx
+; AVX2-NEXT:    shlq $44, %rcx
+; AVX2-NEXT:    orq %rdx, %rcx
+; AVX2-NEXT:    vpextrb $12, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $48, %rdx
+; AVX2-NEXT:    vpextrb $13, %xmm0, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shlq $52, %rsi
+; AVX2-NEXT:    orq %rdx, %rsi
+; AVX2-NEXT:    vpextrb $14, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $56, %rdx
+; AVX2-NEXT:    orq %rsi, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vpextrb $15, %xmm0, %ecx
+; AVX2-NEXT:    shlq $60, %rcx
+; AVX2-NEXT:    orq %rdx, %rcx
+; AVX2-NEXT:    movq %rcx, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: avir_v4i4_to_v128i4:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movq %rdi, %rax
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX512-NEXT:    retq
+  %res = shufflevector <4 x i4> %arg, <4 x i4> poison,
+  <128 x i32> <i32 0     , i32 poison, i32 poison, i32 poison, i32 2     , i32 poison, i32 3     , i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <128 x i4> %res
+}
+
+define <128 x i4> @avir_v8i4_to_v128i4(<8 x i4> %arg) {
+; AVX1-LABEL: avir_v8i4_to_v128i4:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movq %rdi, %rax
+; AVX1-NEXT:    vpextrb $8, %xmm0, %ecx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    shlq $32, %rcx
+; AVX1-NEXT:    vmovd %xmm0, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    vpextrb $1, %xmm0, %esi
+; AVX1-NEXT:    andl $15, %esi
+; AVX1-NEXT:    shll $4, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $2, %xmm0, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    shll $8, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $3, %xmm0, %esi
+; AVX1-NEXT:    andl $15, %esi
+; AVX1-NEXT:    shll $12, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $4, %xmm0, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    shll $16, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $5, %xmm0, %esi
+; AVX1-NEXT:    andl $15, %esi
+; AVX1-NEXT:    shll $20, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $6, %xmm0, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    shll $24, %edx
+; AVX1-NEXT:    vpextrb $7, %xmm0, %edi
+; AVX1-NEXT:    shll $28, %edi
+; AVX1-NEXT:    orl %edx, %edi
+; AVX1-NEXT:    orl %esi, %edi
+; AVX1-NEXT:    orq %rcx, %rdi
+; AVX1-NEXT:    vpextrb $9, %xmm0, %ecx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    shlq $36, %rcx
+; AVX1-NEXT:    orq %rdi, %rcx
+; AVX1-NEXT:    vpextrb $10, %xmm0, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    shlq $40, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
+; AVX1-NEXT:    vpextrb $11, %xmm0, %ecx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    shlq $44, %rcx
+; AVX1-NEXT:    orq %rdx, %rcx
+; AVX1-NEXT:    vpextrb $12, %xmm0, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    shlq $48, %rdx
+; AVX1-NEXT:    vpextrb $13, %xmm0, %esi
+; AVX1-NEXT:    andl $15, %esi
+; AVX1-NEXT:    shlq $52, %rsi
+; AVX1-NEXT:    orq %rdx, %rsi
+; AVX1-NEXT:    vpextrb $14, %xmm0, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    shlq $56, %rdx
+; AVX1-NEXT:    orq %rsi, %rdx
+; AVX1-NEXT:    vpextrb $15, %xmm0, %esi
+; AVX1-NEXT:    shlq $60, %rsi
+; AVX1-NEXT:    orq %rdx, %rsi
+; AVX1-NEXT:    orq %rcx, %rsi
+; AVX1-NEXT:    movq %rsi, (%rax)
+; AVX1-NEXT:    movq $0, 8(%rax)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: avir_v8i4_to_v128i4:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,u,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpextrb $8, %xmm1, %ecx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    shlq $32, %rcx
+; AVX2-NEXT:    vmovd %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpextrb $1, %xmm1, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $4, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $2, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $8, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $3, %xmm1, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $12, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $16, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $5, %xmm1, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $20, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $6, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $24, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm1, %esi
+; AVX2-NEXT:    shll $28, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    orq %rcx, %rsi
+; AVX2-NEXT:    vpextrb $9, %xmm1, %ecx
+; AVX2-NEXT:    movzwl %cx, %ecx
+; AVX2-NEXT:    shlq $36, %rcx
+; AVX2-NEXT:    orq %rsi, %rcx
+; AVX2-NEXT:    vpextrb $10, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $40, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vpextrb $11, %xmm1, %ecx
+; AVX2-NEXT:    movzwl %cx, %ecx
+; AVX2-NEXT:    shlq $44, %rcx
+; AVX2-NEXT:    orq %rdx, %rcx
+; AVX2-NEXT:    vpextrb $12, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $48, %rdx
+; AVX2-NEXT:    vpextrb $13, %xmm1, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shlq $52, %rsi
+; AVX2-NEXT:    orq %rdx, %rsi
+; AVX2-NEXT:    vpextrb $14, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $56, %rdx
+; AVX2-NEXT:    orq %rsi, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vpextrb $15, %xmm1, %ecx
+; AVX2-NEXT:    shlq $60, %rcx
+; AVX2-NEXT:    orq %rdx, %rcx
+; AVX2-NEXT:    movq %rcx, 8(%rdi)
+; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    shlq $32, %rcx
+; AVX2-NEXT:    vmovd %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpextrb $1, %xmm0, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $4, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $2, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $8, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $3, %xmm0, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $12, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $16, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $5, %xmm0, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $20, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $6, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $24, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm0, %esi
+; AVX2-NEXT:    shll $28, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    orq %rcx, %rsi
+; AVX2-NEXT:    vpextrb $9, %xmm0, %ecx
+; AVX2-NEXT:    movzwl %cx, %ecx
+; AVX2-NEXT:    shlq $36, %rcx
+; AVX2-NEXT:    orq %rsi, %rcx
+; AVX2-NEXT:    vpextrb $10, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $40, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vpextrb $11, %xmm0, %ecx
+; AVX2-NEXT:    movzwl %cx, %ecx
+; AVX2-NEXT:    shlq $44, %rcx
+; AVX2-NEXT:    orq %rdx, %rcx
+; AVX2-NEXT:    vpextrb $12, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $48, %rdx
+; AVX2-NEXT:    vpextrb $13, %xmm0, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shlq $52, %rsi
+; AVX2-NEXT:    orq %rdx, %rsi
+; AVX2-NEXT:    vpextrb $14, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $56, %rdx
+; AVX2-NEXT:    orq %rsi, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vpextrb $15, %xmm0, %ecx
+; AVX2-NEXT:    shlq $60, %rcx
+; AVX2-NEXT:    orq %rdx, %rcx
+; AVX2-NEXT:    movq %rcx, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: avir_v8i4_to_v128i4:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movq %rdi, %rax
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX512-NEXT:    retq
+  %res = shufflevector <8 x i4> %arg, <8 x i4> poison,
+  <128 x i32> <i32 0     , i32 poison, i32 poison, i32 poison, i32 2     , i32 poison, i32 3     , i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <128 x i4> %res
+}
+
+define <128 x i4> @avir_v16i4_to_v128i4(<16 x i4> %arg) {
+; AVX1-LABEL: avir_v16i4_to_v128i4:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movq %rdi, %rax
+; AVX1-NEXT:    vpextrb $12, %xmm0, %ecx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    shlq $32, %rcx
+; AVX1-NEXT:    vpextrd $2, %xmm0, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    vpextrb $9, %xmm0, %esi
+; AVX1-NEXT:    andl $15, %esi
+; AVX1-NEXT:    shll $8, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $10, %xmm0, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    shll $16, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $11, %xmm0, %esi
+; AVX1-NEXT:    andl $15, %esi
+; AVX1-NEXT:    shll $24, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    orq %rcx, %rsi
+; AVX1-NEXT:    vpextrb $13, %xmm0, %ecx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    shlq $40, %rcx
+; AVX1-NEXT:    orq %rsi, %rcx
+; AVX1-NEXT:    vpextrb $14, %xmm0, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    shlq $48, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
+; AVX1-NEXT:    vpextrb $15, %xmm0, %ecx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    shlq $56, %rcx
+; AVX1-NEXT:    orq %rdx, %rcx
+; AVX1-NEXT:    movq %rcx, 8(%rdi)
+; AVX1-NEXT:    vpextrb $4, %xmm0, %ecx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    shlq $32, %rcx
+; AVX1-NEXT:    vmovd %xmm0, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    vpextrb $1, %xmm0, %esi
+; AVX1-NEXT:    andl $15, %esi
+; AVX1-NEXT:    shll $8, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $2, %xmm0, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    shll $16, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $3, %xmm0, %esi
+; AVX1-NEXT:    andl $15, %esi
+; AVX1-NEXT:    shll $24, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    orq %rcx, %rsi
+; AVX1-NEXT:    vpextrb $5, %xmm0, %ecx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    shlq $40, %rcx
+; AVX1-NEXT:    orq %rsi, %rcx
+; AVX1-NEXT:    vpextrb $6, %xmm0, %edx
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    shlq $48, %rdx
+; AVX1-NEXT:    orq %rcx, %rdx
+; AVX1-NEXT:    vpextrb $7, %xmm0, %ecx
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    shlq $56, %rcx
+; AVX1-NEXT:    orq %rdx, %rcx
+; AVX1-NEXT:    movq %rcx, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: avir_v16i4_to_v128i4:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpextrb $8, %xmm1, %ecx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    shlq $32, %rcx
+; AVX2-NEXT:    vmovd %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpextrb $1, %xmm1, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $4, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $2, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $8, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $3, %xmm1, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $12, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $16, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $5, %xmm1, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $20, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $6, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $24, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm1, %esi
+; AVX2-NEXT:    shll $28, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    orq %rcx, %rsi
+; AVX2-NEXT:    vpextrb $9, %xmm1, %ecx
+; AVX2-NEXT:    movzwl %cx, %ecx
+; AVX2-NEXT:    shlq $36, %rcx
+; AVX2-NEXT:    orq %rsi, %rcx
+; AVX2-NEXT:    vpextrb $10, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $40, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vpextrb $11, %xmm1, %ecx
+; AVX2-NEXT:    movzwl %cx, %ecx
+; AVX2-NEXT:    shlq $44, %rcx
+; AVX2-NEXT:    orq %rdx, %rcx
+; AVX2-NEXT:    vpextrb $12, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $48, %rdx
+; AVX2-NEXT:    vpextrb $13, %xmm1, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shlq $52, %rsi
+; AVX2-NEXT:    orq %rdx, %rsi
+; AVX2-NEXT:    vpextrb $14, %xmm1, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $56, %rdx
+; AVX2-NEXT:    orq %rsi, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vpextrb $15, %xmm1, %ecx
+; AVX2-NEXT:    shlq $60, %rcx
+; AVX2-NEXT:    orq %rdx, %rcx
+; AVX2-NEXT:    movq %rcx, 8(%rdi)
+; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    shlq $32, %rcx
+; AVX2-NEXT:    vmovd %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpextrb $1, %xmm0, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $4, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $2, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $8, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $3, %xmm0, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $12, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $16, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $5, %xmm0, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shll $20, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $6, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shll $24, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm0, %esi
+; AVX2-NEXT:    shll $28, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    orq %rcx, %rsi
+; AVX2-NEXT:    vpextrb $9, %xmm0, %ecx
+; AVX2-NEXT:    movzwl %cx, %ecx
+; AVX2-NEXT:    shlq $36, %rcx
+; AVX2-NEXT:    orq %rsi, %rcx
+; AVX2-NEXT:    vpextrb $10, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $40, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vpextrb $11, %xmm0, %ecx
+; AVX2-NEXT:    movzwl %cx, %ecx
+; AVX2-NEXT:    shlq $44, %rcx
+; AVX2-NEXT:    orq %rdx, %rcx
+; AVX2-NEXT:    vpextrb $12, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $48, %rdx
+; AVX2-NEXT:    vpextrb $13, %xmm0, %esi
+; AVX2-NEXT:    movzwl %si, %esi
+; AVX2-NEXT:    shlq $52, %rsi
+; AVX2-NEXT:    orq %rdx, %rsi
+; AVX2-NEXT:    vpextrb $14, %xmm0, %edx
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    shlq $56, %rdx
+; AVX2-NEXT:    orq %rsi, %rdx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vpextrb $15, %xmm0, %ecx
+; AVX2-NEXT:    shlq $60, %rcx
+; AVX2-NEXT:    orq %rdx, %rcx
+; AVX2-NEXT:    movq %rcx, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: avir_v16i4_to_v128i4:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movq %rdi, %rax
+; AVX512-NEXT:    vmovaps %xmm0, (%rdi)
+; AVX512-NEXT:    retq
+  %res = shufflevector <16 x i4> %arg, <16 x i4> poison,
+  <128 x i32> <i32 0     , i32 poison, i32 poison, i32 poison, i32 2     , i32 poison, i32 3     , i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+              i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <128 x i4> %res
+}

>From 50f76f0fd20c5a3009cc5ccfe3324633e00201a9 Mon Sep 17 00:00:00 2001
From: Abhishek Kaushik <abhishek.kaushik at intel.com>
Date: Wed, 28 Jan 2026 13:49:07 +0530
Subject: [PATCH 2/2] Use `nounwind` and `getExtractSubvector`

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 3 +--
 llvm/test/CodeGen/AArch64/pr161013.ll                  | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 58e3e74d1a28d..5b32c5f945a75 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2293,8 +2293,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND_VECTOR_INREG(SDNode *N) {
   EVT OpVT = Op.getValueType();
   EVT NewVT = EVT::getVectorVT(*DAG.getContext(), OpVT.getScalarType(),
                                ResVT.getVectorNumElements());
-  Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Op), NewVT, Op,
-                   DAG.getVectorIdxConstant(0, SDLoc(Op)));
+  Op = DAG.getExtractSubvector(SDLoc(Op), NewVT, Op, 0);
   return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), ResVT, Op);
 }
 
diff --git a/llvm/test/CodeGen/AArch64/pr161013.ll b/llvm/test/CodeGen/AArch64/pr161013.ll
index 5dd4df61aceb2..d163914f1ac0e 100644
--- a/llvm/test/CodeGen/AArch64/pr161013.ll
+++ b/llvm/test/CodeGen/AArch64/pr161013.ll
@@ -1,11 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -mtriple=aarch64-- < %s | FileCheck %s
 
-define <16 x i4> @avir_v2i4_v16i4(<2 x i4> %arg) {
+define <16 x i4> @avir_v2i4_v16i4(<2 x i4> %arg) nounwind {
 ; CHECK-LABEL: avir_v2i4_v16i4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
 ; CHECK-NEXT:    str d0, [sp, #8]
 ; CHECK-NEXT:    ldr x8, [sp, #8]



More information about the llvm-commits mailing list