[llvm] [X86] Support lowering for APX Promoted SHA/MOVDIR/CRC32/INVPCID instructions (PR #76786)

Tue Jan 2 23:21:55 PST 2024

https://github.com/XinWang10 updated https://github.com/llvm/llvm-project/pull/76786

>From c0c6ab1f2a91fa96a119eac8e1bb3bf0deb8dfdc Mon Sep 17 00:00:00 2001
From: "Wang, Xin10" <xin10.wang at intel.com>
Date: Tue, 2 Jan 2024 23:10:13 -0800
Subject: [PATCH 1/2] Support Lowering for APX Promoted
 SHA/MOVDIR/CRC32/INVPCID instructions

---
 llvm/lib/Target/X86/X86FastISel.cpp          |   8 +-
 llvm/lib/Target/X86/X86InstrSystem.td        |  13 +-
 llvm/lib/Target/X86/X86InstrVMX.td           |   8 +-
 llvm/test/CodeGen/X86/apx/cet.ll             |  50 +++++
 llvm/test/CodeGen/X86/apx/crc32-fast-isel.ll |  61 ++++++
 llvm/test/CodeGen/X86/apx/crc32.ll           |  58 ++++++
 llvm/test/CodeGen/X86/apx/invpcid.ll         |  27 +++
 llvm/test/CodeGen/X86/apx/movdir.ll          |  38 ++++
 llvm/test/CodeGen/X86/apx/sha.ll             | 186 +++++++++++++++++++
 9 files changed, 438 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/apx/cet.ll
 create mode 100644 llvm/test/CodeGen/X86/apx/crc32-fast-isel.ll
 create mode 100644 llvm/test/CodeGen/X86/apx/crc32.ll
 create mode 100644 llvm/test/CodeGen/X86/apx/invpcid.ll
 create mode 100644 llvm/test/CodeGen/X86/apx/movdir.ll
 create mode 100644 llvm/test/CodeGen/X86/apx/sha.ll

diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 0ba31e173a1a72..3658af785c24ec 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -3047,19 +3047,19 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
     default:
       llvm_unreachable("Unexpected intrinsic.");
     case Intrinsic::x86_sse42_crc32_32_8:
-      Opc = X86::CRC32r32r8;
+      Opc = Subtarget->hasCRC32() ? X86::CRC32r32r8_EVEX : X86::CRC32r32r8;
       RC = &X86::GR32RegClass;
       break;
     case Intrinsic::x86_sse42_crc32_32_16:
-      Opc = X86::CRC32r32r16;
+      Opc = Subtarget->hasCRC32() ? X86::CRC32r32r16_EVEX : X86::CRC32r32r16;
       RC = &X86::GR32RegClass;
       break;
     case Intrinsic::x86_sse42_crc32_32_32:
-      Opc = X86::CRC32r32r32;
+      Opc = Subtarget->hasCRC32() ? X86::CRC32r32r32_EVEX : X86::CRC32r32r32;
       RC = &X86::GR32RegClass;
       break;
     case Intrinsic::x86_sse42_crc32_64_64:
-      Opc = X86::CRC32r64r64;
+      Opc = Subtarget->hasCRC32() ? X86::CRC32r64r64_EVEX : X86::CRC32r64r64;
       RC = &X86::GR64RegClass;
       break;
     }
diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td
index 699e5847e63fb9..30530a00809f3f 100644
--- a/llvm/lib/Target/X86/X86InstrSystem.td
+++ b/llvm/lib/Target/X86/X86InstrSystem.td
@@ -695,14 +695,14 @@ def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
                   Requires<[Not64BitMode, HasINVPCID]>;
 def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
                   "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8, PD,
-                  Requires<[In64BitMode, HasINVPCID]>;
+                  Requires<[In64BitMode, HasINVPCID, NoEGPR]>;
 
 def INVPCID64_EVEX : I<0xF2, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
                        "invpcid\t{$src2, $src1|$src1, $src2}", []>,
-                     EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode, HasINVPCID]>;
+                     EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode, HasINVPCID, HasEGPR]>;
 } // SchedRW
 
-let Predicates = [In64BitMode, HasINVPCID] in {
+let Predicates = [In64BitMode, HasINVPCID, NoEGPR] in {
   // The instruction can only use a 64 bit register as the register argument
   // in 64 bit mode, while the intrinsic only accepts a 32 bit argument
   // corresponding to it.
@@ -714,6 +714,13 @@ let Predicates = [In64BitMode, HasINVPCID] in {
               addr:$src2)>;
 }
 
+let Predicates = [In64BitMode, HasINVPCID, HasEGPR] in {
+  def : Pat<(int_x86_invpcid GR32:$src1, addr:$src2),
+            (INVPCID64_EVEX
+              (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src1), sub_32bit),
+              addr:$src2)>;
+}
+
 
 //===----------------------------------------------------------------------===//
 // SMAP Instruction
diff --git a/llvm/lib/Target/X86/X86InstrVMX.td b/llvm/lib/Target/X86/X86InstrVMX.td
index 7cc468fe15ad4e..e6722467897216 100644
--- a/llvm/lib/Target/X86/X86InstrVMX.td
+++ b/llvm/lib/Target/X86/X86InstrVMX.td
@@ -21,10 +21,10 @@ def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
                Requires<[Not64BitMode]>;
 def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
                "invept\t{$src2, $src1|$src1, $src2}", []>, T8, PD,
-               Requires<[In64BitMode]>;
+               Requires<[In64BitMode, NoEGPR]>;
 def INVEPT64_EVEX : I<0xF0, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
                       "invept\t{$src2, $src1|$src1, $src2}", []>,
-                    EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode]>;
+                    EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode, HasEGPR]>;
 
 // 66 0F 38 81
 def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
@@ -32,10 +32,10 @@ def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
                 Requires<[Not64BitMode]>;
 def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
                 "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8, PD,
-                Requires<[In64BitMode]>;
+                Requires<[In64BitMode, NoEGPR]>;
 def INVVPID64_EVEX : I<0xF1, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
                        "invvpid\t{$src2, $src1|$src1, $src2}", []>,
-                     EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode]>;
+                     EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode, HasEGPR]>;
 
 // 0F 01 C1
 def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
diff --git a/llvm/test/CodeGen/X86/apx/cet.ll b/llvm/test/CodeGen/X86/apx/cet.ll
new file mode 100644
index 00000000000000..98f3844d1ccd19
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/cet.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+shstk,+egpr --show-mc-encoding | FileCheck %s
+
+define void @test_wrssd(i32 %a, ptr %__p) {
+; CHECK-LABEL: test_wrssd:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    wrssd %edi, (%rsi) ## encoding: [0x62,0xf4,0x7c,0x08,0x66,0x3e]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+entry:
+  tail call void @llvm.x86.wrssd(i32 %a, ptr %__p)
+  ret void
+}
+
+declare void @llvm.x86.wrssd(i32, ptr)
+
+define void @test_wrssq(i64 %a, ptr %__p) {
+; CHECK-LABEL: test_wrssq:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    wrssq %rdi, (%rsi) ## encoding: [0x62,0xf4,0xfc,0x08,0x66,0x3e]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+entry:
+  tail call void @llvm.x86.wrssq(i64 %a, ptr %__p)
+  ret void
+}
+
+declare void @llvm.x86.wrssq(i64, ptr)
+
+define void @test_wrussd(i32 %a, ptr %__p) {
+; CHECK-LABEL: test_wrussd:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    wrussd %edi, (%rsi) ## encoding: [0x62,0xf4,0x7d,0x08,0x65,0x3e]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+entry:
+  tail call void @llvm.x86.wrussd(i32 %a, ptr %__p)
+  ret void
+}
+
+declare void @llvm.x86.wrussd(i32, ptr)
+
+define void @test_wrussq(i64 %a, ptr %__p) {
+; CHECK-LABEL: test_wrussq:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    wrussq %rdi, (%rsi) ## encoding: [0x62,0xf4,0xfd,0x08,0x65,0x3e]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+entry:
+  tail call void @llvm.x86.wrussq(i64 %a, ptr %__p)
+  ret void
+}
+
+declare void @llvm.x86.wrussq(i64, ptr)
diff --git a/llvm/test/CodeGen/X86/apx/crc32-fast-isel.ll b/llvm/test/CodeGen/X86/apx/crc32-fast-isel.ll
new file mode 100644
index 00000000000000..0b51679ccd7fb5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/crc32-fast-isel.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -fast-isel-abort=3 -mtriple=x86_64-unknown-unknown -mattr=-sse4.2,+crc32,+egpr --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -fast-isel -fast-isel-abort=3 -mtriple=x86_64-unknown-unknown -mattr=+crc32,+egpr --show-mc-encoding | FileCheck %s
+
+define i32 @test_mm_crc32_u8(i32 %a0, i32 %a1) nounwind {
+; CHECK-LABEL: test_mm_crc32_u8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax # encoding: [0x89,0xf8]
+; CHECK-NEXT:    crc32b %sil, %eax # encoding: [0x62,0xf4,0x7c,0x08,0xf0,0xc6]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %trunc = trunc i32 %a1 to i8
+  %res = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %trunc)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind readnone
+
+define i32 @test_mm_crc32_u16(i32 %a0, i32 %a1) nounwind {
+; CHECK-LABEL: test_mm_crc32_u16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax # encoding: [0x89,0xf8]
+; CHECK-NEXT:    crc32w %si, %eax # encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xc6]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %trunc = trunc i32 %a1 to i16
+  %res = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %trunc)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind readnone
+
+define i32 @test_mm_crc32_u32(i32 %a0, i32 %a1) nounwind {
+; CHECK-LABEL: test_mm_crc32_u32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax # encoding: [0x89,0xf8]
+; CHECK-NEXT:    crc32l %esi, %eax # encoding: [0x62,0xf4,0x7c,0x08,0xf1,0xc6]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %res = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1)
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind readnone
+
+define i64 @test_mm_crc64_u8(i64 %a0, i32 %a1) nounwind{
+; CHECK-LABEL: test_mm_crc64_u8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    crc32b %sil, %edi # encoding: [0x62,0xf4,0x7c,0x08,0xf0,0xfe]
+; CHECK-NEXT:    movl %edi, %eax # encoding: [0x89,0xf8]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %trunc = trunc i32 %a1 to i8
+  %res = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a0, i8 %trunc)
+  ret i64 %res
+}
+declare i64 @llvm.x86.sse42.crc32.64.8(i64, i8) nounwind readnone
+
+define i64 @test_mm_crc64_u64(i64 %a0, i64 %a1) nounwind{
+; CHECK-LABEL: test_mm_crc64_u64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax # encoding: [0x48,0x89,0xf8]
+; CHECK-NEXT:    crc32q %rsi, %rax # encoding: [0x62,0xf4,0xfc,0x08,0xf1,0xc6]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %res = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1)
+  ret i64 %res
+}
+declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/apx/crc32.ll b/llvm/test/CodeGen/X86/apx/crc32.ll
new file mode 100644
index 00000000000000..4bcc4d15cc6b5a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/crc32.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+crc32,+egpr -show-mc-encoding | FileCheck %s
+
+define i32 @crc32_32_8(i32 %a, i8 %b) nounwind {
+; CHECK-LABEL: crc32_32_8:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; CHECK-NEXT:    crc32b %sil, %eax ## encoding: [0x62,0xf4,0x7c,0x08,0xf0,0xc6]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %tmp = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a, i8 %b)
+  ret i32 %tmp
+}
+
+define i32 @crc32_32_16(i32 %a, i16 %b) nounwind {
+; CHECK-LABEL: crc32_32_16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; CHECK-NEXT:    crc32w %si, %eax ## encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xc6]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %tmp = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a, i16 %b)
+  ret i32 %tmp
+}
+
+define i32 @crc32_32_32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: crc32_32_32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; CHECK-NEXT:    crc32l %esi, %eax ## encoding: [0x62,0xf4,0x7c,0x08,0xf1,0xc6]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %tmp = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a, i32 %b)
+  ret i32 %tmp
+}
+
+define i64 @crc32_64_8(i64 %a, i8 %b) nounwind {
+; CHECK-LABEL: crc32_64_8:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
+; CHECK-NEXT:    crc32b %sil, %eax ## encoding: [0x62,0xf4,0x7c,0x08,0xf0,0xc6]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %tmp = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a, i8 %b)
+  ret i64 %tmp
+}
+
+define i64 @crc32_64_64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: crc32_64_64:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
+; CHECK-NEXT:    crc32q %rsi, %rax ## encoding: [0x62,0xf4,0xfc,0x08,0xf1,0xc6]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %tmp = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a, i64 %b)
+  ret i64 %tmp
+}
+
+declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
+declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
+declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
+declare i64 @llvm.x86.sse42.crc32.64.8(i64, i8) nounwind
+declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind
diff --git a/llvm/test/CodeGen/X86/apx/invpcid.ll b/llvm/test/CodeGen/X86/apx/invpcid.ll
new file mode 100644
index 00000000000000..389895f4921305
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/invpcid.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+invpcid,+egpr --show-mc-encoding | FileCheck %s
+
+define void @test_invpcid(i32 %type, ptr %descriptor) {
+; CHECK-LABEL: test_invpcid:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl %edi, %eax # encoding: [0x89,0xf8]
+; CHECK-NEXT:    invpcid (%rsi), %rax # encoding: [0x62,0xf4,0x7e,0x08,0xf2,0x06]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  call void @llvm.x86.invpcid(i32 %type, ptr %descriptor)
+  ret void
+}
+
+define void @test_invpcid2(ptr readonly %type, ptr %descriptor) {
+; CHECK-LABEL: test_invpcid2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl (%rdi), %eax # encoding: [0x8b,0x07]
+; CHECK-NEXT:    invpcid (%rsi), %rax # encoding: [0x62,0xf4,0x7e,0x08,0xf2,0x06]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = load i32, ptr %type, align 4
+  tail call void @llvm.x86.invpcid(i32 %0, ptr %descriptor) #1
+  ret void
+}
+
+declare void @llvm.x86.invpcid(i32, ptr)
diff --git a/llvm/test/CodeGen/X86/apx/movdir.ll b/llvm/test/CodeGen/X86/apx/movdir.ll
new file mode 100644
index 00000000000000..06fd7511bc143c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/movdir.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+movdiri,+movdir64b,+egpr --show-mc-encoding | FileCheck %s
+
+define void @test_movdiri(ptr %p, i32 %v) {
+; CHECK-LABEL: test_movdiri:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movdiri %esi, (%rdi) # encoding: [0x62,0xf4,0x7c,0x08,0xf9,0x37]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  call void @llvm.x86.directstore32(ptr %p, i32 %v)
+  ret void
+}
+
+declare void @llvm.x86.directstore32(ptr, i32)
+
+define void @test_movdiri_64(ptr %p, i64 %v) {
+; CHECK-LABEL: test_movdiri_64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movdiri %rsi, (%rdi) # encoding: [0x62,0xf4,0xfc,0x08,0xf9,0x37]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  call void @llvm.x86.directstore64(ptr %p, i64 %v)
+  ret void
+}
+
+declare void @llvm.x86.directstore64(ptr, i64)
+
+define void @test_movdir64b(ptr %dst, ptr %src) {
+; CHECK-LABEL: test_movdir64b:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movdir64b (%rsi), %rdi # encoding: [0x62,0xf4,0x7d,0x08,0xf8,0x3e]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  call void @llvm.x86.movdir64b(ptr %dst, ptr %src)
+  ret void
+}
+
+declare void @llvm.x86.movdir64b(ptr, ptr)
diff --git a/llvm/test/CodeGen/X86/apx/sha.ll b/llvm/test/CodeGen/X86/apx/sha.ll
new file mode 100644
index 00000000000000..088ee61a97f4ea
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/sha.ll
@@ -0,0 +1,186 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mattr=+sha,+egpr -mtriple=x86_64-unknown-unknown --show-mc-encoding | FileCheck %s
+
+declare <4 x i32> @llvm.x86.sha1rnds4(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <4 x i32> @test_sha1rnds4rr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+; CHECK-LABEL: test_sha1rnds4rr:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sha1rnds4 $3, %xmm1, %xmm0 # encoding: [0x62,0xf4,0x7c,0x08,0xd4,0xc1,0x03]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = tail call <4 x i32> @llvm.x86.sha1rnds4(<4 x i32> %a, <4 x i32> %b, i8 3)
+  ret <4 x i32> %0
+}
+
+define <4 x i32> @test_sha1rnds4rm(<4 x i32> %a, ptr %b) nounwind uwtable {
+; CHECK-LABEL: test_sha1rnds4rm:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sha1rnds4 $3, (%rdi), %xmm0 # encoding: [0x62,0xf4,0x7c,0x08,0xd4,0x07,0x03]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = load <4 x i32>, ptr %b
+  %1 = tail call <4 x i32> @llvm.x86.sha1rnds4(<4 x i32> %a, <4 x i32> %0, i8 3)
+  ret <4 x i32> %1
+}
+
+declare <4 x i32> @llvm.x86.sha1nexte(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha1nexterr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+; CHECK-LABEL: test_sha1nexterr:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sha1nexte %xmm1, %xmm0 # encoding: [0x62,0xf4,0x7c,0x08,0xd8,0xc1]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = tail call <4 x i32> @llvm.x86.sha1nexte(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %0
+}
+
+define <4 x i32> @test_sha1nexterm(<4 x i32> %a, ptr %b) nounwind uwtable {
+; CHECK-LABEL: test_sha1nexterm:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sha1nexte (%rdi), %xmm0 # encoding: [0x62,0xf4,0x7c,0x08,0xd8,0x07]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = load <4 x i32>, ptr %b
+  %1 = tail call <4 x i32> @llvm.x86.sha1nexte(<4 x i32> %a, <4 x i32> %0)
+  ret <4 x i32> %1
+}
+
+declare <4 x i32> @llvm.x86.sha1msg1(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha1msg1rr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+; CHECK-LABEL: test_sha1msg1rr:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sha1msg1 %xmm1, %xmm0 # encoding: [0x62,0xf4,0x7c,0x08,0xd9,0xc1]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = tail call <4 x i32> @llvm.x86.sha1msg1(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %0
+}
+
+define <4 x i32> @test_sha1msg1rm(<4 x i32> %a, ptr %b) nounwind uwtable {
+; CHECK-LABEL: test_sha1msg1rm:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sha1msg1 (%rdi), %xmm0 # encoding: [0x62,0xf4,0x7c,0x08,0xd9,0x07]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = load <4 x i32>, ptr %b
+  %1 = tail call <4 x i32> @llvm.x86.sha1msg1(<4 x i32> %a, <4 x i32> %0)
+  ret <4 x i32> %1
+}
+
+declare <4 x i32> @llvm.x86.sha1msg2(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha1msg2rr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+; CHECK-LABEL: test_sha1msg2rr:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sha1msg2 %xmm1, %xmm0 # encoding: [0x62,0xf4,0x7c,0x08,0xda,0xc1]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = tail call <4 x i32> @llvm.x86.sha1msg2(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %0
+}
+
+define <4 x i32> @test_sha1msg2rm(<4 x i32> %a, ptr %b) nounwind uwtable {
+; CHECK-LABEL: test_sha1msg2rm:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sha1msg2 (%rdi), %xmm0 # encoding: [0x62,0xf4,0x7c,0x08,0xda,0x07]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = load <4 x i32>, ptr %b
+  %1 = tail call <4 x i32> @llvm.x86.sha1msg2(<4 x i32> %a, <4 x i32> %0)
+  ret <4 x i32> %1
+}
+
+declare <4 x i32> @llvm.x86.sha256rnds2(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha256rnds2rr(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind uwtable {
+;
+; CHECK-LABEL: test_sha256rnds2rr:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movaps %xmm0, %xmm3 # encoding: [0x0f,0x28,0xd8]
+; CHECK-NEXT:    movaps %xmm2, %xmm0 # encoding: [0x0f,0x28,0xc2]
+; CHECK-NEXT:    sha256rnds2 %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf4,0x7c,0x08,0xdb,0xd9]
+; CHECK-NEXT:    movaps %xmm3, %xmm0 # encoding: [0x0f,0x28,0xc3]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = tail call <4 x i32> @llvm.x86.sha256rnds2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
+  ret <4 x i32> %0
+}
+
+define <4 x i32> @test_sha256rnds2rm(<4 x i32> %a, ptr %b, <4 x i32> %c) nounwind uwtable {
+;
+; CHECK-LABEL: test_sha256rnds2rm:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movaps %xmm0, %xmm2 # encoding: [0x0f,0x28,0xd0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
+; CHECK-NEXT:    sha256rnds2 %xmm0, (%rdi), %xmm2 # encoding: [0x62,0xf4,0x7c,0x08,0xdb,0x17]
+; CHECK-NEXT:    movaps %xmm2, %xmm0 # encoding: [0x0f,0x28,0xc2]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = load <4 x i32>, ptr %b
+  %1 = tail call <4 x i32> @llvm.x86.sha256rnds2(<4 x i32> %a, <4 x i32> %0, <4 x i32> %c)
+  ret <4 x i32> %1
+}
+
+declare <4 x i32> @llvm.x86.sha256msg1(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha256msg1rr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+; CHECK-LABEL: test_sha256msg1rr:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sha256msg1 %xmm1, %xmm0 # encoding: [0x62,0xf4,0x7c,0x08,0xdc,0xc1]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = tail call <4 x i32> @llvm.x86.sha256msg1(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %0
+}
+
+define <4 x i32> @test_sha256msg1rm(<4 x i32> %a, ptr %b) nounwind uwtable {
+; CHECK-LABEL: test_sha256msg1rm:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sha256msg1 (%rdi), %xmm0 # encoding: [0x62,0xf4,0x7c,0x08,0xdc,0x07]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = load <4 x i32>, ptr %b
+  %1 = tail call <4 x i32> @llvm.x86.sha256msg1(<4 x i32> %a, <4 x i32> %0)
+  ret <4 x i32> %1
+}
+
+declare <4 x i32> @llvm.x86.sha256msg2(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha256msg2rr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+; CHECK-LABEL: test_sha256msg2rr:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sha256msg2 %xmm1, %xmm0 # encoding: [0x62,0xf4,0x7c,0x08,0xdd,0xc1]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = tail call <4 x i32> @llvm.x86.sha256msg2(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %0
+}
+
+define <4 x i32> @test_sha256msg2rm(<4 x i32> %a, ptr %b) nounwind uwtable {
+; CHECK-LABEL: test_sha256msg2rm:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sha256msg2 (%rdi), %xmm0 # encoding: [0x62,0xf4,0x7c,0x08,0xdd,0x07]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = load <4 x i32>, ptr %b
+  %1 = tail call <4 x i32> @llvm.x86.sha256msg2(<4 x i32> %a, <4 x i32> %0)
+  ret <4 x i32> %1
+}
+
+; Make sure we don't forget that sha instructions have no VEX equivalents and thus don't zero YMM/ZMM.
+define <8 x i32> @test_sha1rnds4_zero_extend(<4 x i32> %a, ptr %b) nounwind uwtable {
+;
+; CHECK-LABEL: test_sha1rnds4_zero_extend:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sha1rnds4 $3, (%rdi), %xmm0 # encoding: [0x62,0xf4,0x7c,0x08,0xd4,0x07,0x03]
+; CHECK-NEXT:    xorps %xmm1, %xmm1 # encoding: [0x0f,0x57,0xc9]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %0 = load <4 x i32>, ptr %b
+  %1 = tail call <4 x i32> @llvm.x86.sha1rnds4(<4 x i32> %a, <4 x i32> %0, i8 3)
+  %2 = shufflevector <4 x i32> %1, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %2
+}

>From b6d60b595e3784df18042a858206f28f7e5a1082 Mon Sep 17 00:00:00 2001
From: "Wang, Xin10" <xin10.wang at intel.com>
Date: Tue, 2 Jan 2024 23:21:46 -0800
Subject: [PATCH 2/2] Fix error: select EVEX CRC32 opcodes in FastISel based on hasEGPR() instead of hasCRC32()

---
 llvm/lib/Target/X86/X86FastISel.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 3658af785c24ec..8b4ff4c8ed878f 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -3047,19 +3047,19 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
     default:
       llvm_unreachable("Unexpected intrinsic.");
     case Intrinsic::x86_sse42_crc32_32_8:
-      Opc = Subtarget->hasCRC32() ? X86::CRC32r32r8_EVEX : X86::CRC32r32r8;
+      Opc = Subtarget->hasEGPR() ? X86::CRC32r32r8_EVEX : X86::CRC32r32r8;
       RC = &X86::GR32RegClass;
       break;
     case Intrinsic::x86_sse42_crc32_32_16:
-      Opc = Subtarget->hasCRC32() ? X86::CRC32r32r16_EVEX : X86::CRC32r32r16;
+      Opc = Subtarget->hasEGPR() ? X86::CRC32r32r16_EVEX : X86::CRC32r32r16;
       RC = &X86::GR32RegClass;
       break;
     case Intrinsic::x86_sse42_crc32_32_32:
-      Opc = Subtarget->hasCRC32() ? X86::CRC32r32r32_EVEX : X86::CRC32r32r32;
+      Opc = Subtarget->hasEGPR() ? X86::CRC32r32r32_EVEX : X86::CRC32r32r32;
       RC = &X86::GR32RegClass;
       break;
     case Intrinsic::x86_sse42_crc32_64_64:
-      Opc = Subtarget->hasCRC32() ? X86::CRC32r64r64_EVEX : X86::CRC32r64r64;
+      Opc = Subtarget->hasEGPR() ? X86::CRC32r64r64_EVEX : X86::CRC32r64r64;
       RC = &X86::GR64RegClass;
       break;
     }