Commit 936546dc authored by Heikki Linnakangas

Optimize pg_comp_crc32c_sse42 routine slightly, and also use it on x86.

Eliminate the separate 'len' variable from the loops, and also use the
4-byte instruction. This shaves off a few more cycles. Even though this
routine, which uses the special SSE 4.2 instructions, is much faster than a
generic routine, it's still a hot spot, so let's make it as fast as
possible.

Change the configure test to not test _mm_crc32_u64. That variant is only
available in the 64-bit x86-64 architecture, not in 32-bit x86. Modify
pg_comp_crc32c_sse42 so that it only uses _mm_crc32_u64 on x86-64. With
these changes, the SSE accelerated CRC-32C implementation can also be used
on 32-bit x86 systems.

This also fixes the 32-bit MSVC build.
parent b73e7a07
...@@ -476,12 +476,16 @@ fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS ...@@ -476,12 +476,16 @@ fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
# PGAC_SSE42_CRC32_INTRINSICS # PGAC_SSE42_CRC32_INTRINSICS
# ----------------------- # -----------------------
# Check if the compiler supports _mm_crc32_u8 and _mm_crc32_u64 intrinsics. # Check if the compiler supports the x86 CRC instructions added in SSE 4.2,
# using the _mm_crc32_u8 and _mm_crc32_u32 intrinsic functions. (We don't
# test the 8-byte variant, _mm_crc32_u64, but it is assumed to be present if
# the other ones are, on x86-64 platforms)
#
# An optional compiler flag can be passed as argument (e.g. -msse4.2). If the # An optional compiler flag can be passed as argument (e.g. -msse4.2). If the
# intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_SSE42. # intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_SSE42.
AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS], AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl [define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl
AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=$1], [Ac_cachevar], AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=$1], [Ac_cachevar],
[pgac_save_CFLAGS=$CFLAGS [pgac_save_CFLAGS=$CFLAGS
CFLAGS="$pgac_save_CFLAGS $1" CFLAGS="$pgac_save_CFLAGS $1"
ac_save_c_werror_flag=$ac_c_werror_flag ac_save_c_werror_flag=$ac_c_werror_flag
...@@ -489,7 +493,7 @@ ac_c_werror_flag=yes ...@@ -489,7 +493,7 @@ ac_c_werror_flag=yes
AC_TRY_LINK([#include <nmmintrin.h>], AC_TRY_LINK([#include <nmmintrin.h>],
[unsigned int crc = 0; [unsigned int crc = 0;
crc = _mm_crc32_u8(crc, 0); crc = _mm_crc32_u8(crc, 0);
crc = (unsigned int) _mm_crc32_u64(crc, 0);], crc = _mm_crc32_u32(crc, 0);],
[Ac_cachevar=yes], [Ac_cachevar=yes],
[Ac_cachevar=no]) [Ac_cachevar=no])
ac_c_werror_flag=$ac_save_c_werror_flag ac_c_werror_flag=$ac_save_c_werror_flag
......
...@@ -14172,8 +14172,8 @@ fi ...@@ -14172,8 +14172,8 @@ fi
# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
# with the default compiler flags. If not, check if adding the -msse4.2 # with the default compiler flags. If not, check if adding the -msse4.2
# flag helps. CFLAGS_SSE42 is set to -msse4.2 if that's required. # flag helps. CFLAGS_SSE42 is set to -msse4.2 if that's required.
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=" >&5 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=" >&5
$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=... " >&6; } $as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=... " >&6; }
if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then : if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then :
$as_echo_n "(cached) " >&6 $as_echo_n "(cached) " >&6
else else
...@@ -14189,7 +14189,7 @@ main () ...@@ -14189,7 +14189,7 @@ main ()
{ {
unsigned int crc = 0; unsigned int crc = 0;
crc = _mm_crc32_u8(crc, 0); crc = _mm_crc32_u8(crc, 0);
crc = (unsigned int) _mm_crc32_u64(crc, 0); crc = _mm_crc32_u32(crc, 0);
; ;
return 0; return 0;
} }
...@@ -14212,8 +14212,8 @@ if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then ...@@ -14212,8 +14212,8 @@ if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then
fi fi
if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=-msse4.2" >&5 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2" >&5
$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=-msse4.2... " >&6; } $as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2... " >&6; }
if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then : if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then :
$as_echo_n "(cached) " >&6 $as_echo_n "(cached) " >&6
else else
...@@ -14229,7 +14229,7 @@ main () ...@@ -14229,7 +14229,7 @@ main ()
{ {
unsigned int crc = 0; unsigned int crc = 0;
crc = _mm_crc32_u8(crc, 0); crc = _mm_crc32_u8(crc, 0);
crc = (unsigned int) _mm_crc32_u64(crc, 0); crc = _mm_crc32_u32(crc, 0);
; ;
return 0; return 0;
} }
......
...@@ -22,30 +22,45 @@ pg_crc32c ...@@ -22,30 +22,45 @@ pg_crc32c
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
{ {
const unsigned char *p = data; const unsigned char *p = data;
const uint64 *p8; const unsigned char *pend = p + len;
/* /*
* Process eight bytes of data at a time. * Process eight bytes of data at a time.
* *
* NB: We do unaligned 8-byte accesses here. The Intel architecture * NB: We do unaligned accesses here. The Intel architecture allows that,
* allows that, and performance testing didn't show any performance * and performance testing didn't show any performance gain from aligning
* gain from aligning the beginning address. * the begin address.
*/ */
p8 = (const uint64 *) p; #ifdef __x86_64__
while (len >= 8) while (p + 8 <= pend)
{ {
crc = (uint32) _mm_crc32_u64(crc, *p8++); crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
len -= 8; p += 8;
} }
/* Process remaining full four bytes if any */
if (p + 4 <= pend)
{
crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
p += 4;
}
#else
/* /*
* Handle any remaining bytes one at a time. * Process four bytes at a time. (The eight byte instruction is not
* available on the 32-bit x86 architecture).
*/ */
p = (const unsigned char *) p8; while (p + 4 <= pend)
while (len > 0) {
crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
p += 4;
}
#endif /* __x86_64__ */
/* Process any remaining bytes one at a time. */
while (p < pend)
{ {
crc = _mm_crc32_u8(crc, *p++); crc = _mm_crc32_u8(crc, *p);
len--; p++;
} }
return crc; return crc;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment