Commit fc6c7274 authored by Alvaro Herrera's avatar Alvaro Herrera

Fix compiler builtin usage in new pg_bitutils.c

Split out these new functions in three parts: one in a new file that
uses the compiler builtin and gets compiled with the -mpopcnt compiler
option if it exists; another one that uses the compiler builtin but not
the compiler option; and finally the fallback with open-coded
algorithms.

Split out the configure logic: in the original commit, it was selecting
to use the -mpopcnt compiler switch together with deciding whether to
use the compiler builtin, but those two things are really separate.
Split them out.  Also, expose whether the builtin exists to
Makefile.global, so that src/port's Makefile can decide whether to
compile the hw-optimized file.

Remove CPUID test for CTZ/CLZ.  Make pg_{right,left}most_ones use either
the compiler intrinsic or open-coded algo; trying to use the
HW-optimized version is a waste of time.  Make them static inline
functions.

Discussion: https://postgr.es/m/20190213221719.GA15976@alvherre.pgsql
parent b060e6c1
...@@ -381,22 +381,16 @@ fi])# PGAC_C_BUILTIN_OP_OVERFLOW ...@@ -381,22 +381,16 @@ fi])# PGAC_C_BUILTIN_OP_OVERFLOW
# PGAC_C_BUILTIN_POPCOUNT # PGAC_C_BUILTIN_POPCOUNT
# ------------------------- # -------------------------
AC_DEFUN([PGAC_C_BUILTIN_POPCOUNT], AC_DEFUN([PGAC_C_BUILTIN_POPCOUNT],
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_popcount])])dnl [AC_CACHE_CHECK([for __builtin_popcount], pgac_cv__builtin_popcount,
AC_CACHE_CHECK([for __builtin_popcount], [Ac_cachevar], [AC_COMPILE_IFELSE([AC_LANG_SOURCE(
[pgac_save_CFLAGS=$CFLAGS [static int x = __builtin_popcount(255);]
CFLAGS="$pgac_save_CFLAGS -mpopcnt" )],
AC_COMPILE_IFELSE([AC_LANG_SOURCE( [pgac_cv__builtin_popcount=yes],
[static int x = __builtin_popcount(255);])], [pgac_cv__builtin_popcount=no])])
[Ac_cachevar=yes], if test x"$pgac_cv__builtin_popcount" = x"yes"; then
[Ac_cachevar=no])
CFLAGS="$pgac_save_CFLAGS"])
if test x"$Ac_cachevar" = x"yes"; then
CFLAGS_POPCNT="-mpopcnt"
AC_DEFINE(HAVE__BUILTIN_POPCOUNT, 1, AC_DEFINE(HAVE__BUILTIN_POPCOUNT, 1,
[Define to 1 if your compiler understands __builtin_popcount.]) [Define to 1 if your compiler understands __builtin_popcount.])
fi fi])# PGAC_C_BUILTIN_POPCOUNT
undefine([Ac_cachevar])dnl
])# PGAC_C_BUILTIN_POPCOUNT
......
...@@ -651,7 +651,7 @@ CFLAGS_ARMV8_CRC32C ...@@ -651,7 +651,7 @@ CFLAGS_ARMV8_CRC32C
CFLAGS_SSE42 CFLAGS_SSE42
have_win32_dbghelp have_win32_dbghelp
LIBOBJS LIBOBJS
CFLAGS_POPCNT have__builtin_popcount
UUID_LIBS UUID_LIBS
LDAP_LIBS_BE LDAP_LIBS_BE
LDAP_LIBS_FE LDAP_LIBS_FE
...@@ -733,6 +733,7 @@ CPP ...@@ -733,6 +733,7 @@ CPP
BITCODE_CXXFLAGS BITCODE_CXXFLAGS
BITCODE_CFLAGS BITCODE_CFLAGS
CFLAGS_VECTOR CFLAGS_VECTOR
CFLAGS_POPCNT
PERMIT_DECLARATION_AFTER_STATEMENT PERMIT_DECLARATION_AFTER_STATEMENT
LLVM_BINPATH LLVM_BINPATH
LLVM_CXXFLAGS LLVM_CXXFLAGS
...@@ -6581,6 +6582,48 @@ fi ...@@ -6581,6 +6582,48 @@ fi
fi fi
# Optimization flags and options for bit-twiddling
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CC} supports -mpopcnt, for CFLAGS_POPCNT" >&5
$as_echo_n "checking whether ${CC} supports -mpopcnt, for CFLAGS_POPCNT... " >&6; }
if ${pgac_cv_prog_CC_cflags__mpopcnt+:} false; then :
$as_echo_n "(cached) " >&6
else
pgac_save_CFLAGS=$CFLAGS
pgac_save_CC=$CC
CC=${CC}
CFLAGS="${CFLAGS_POPCNT} -mpopcnt"
ac_save_c_werror_flag=$ac_c_werror_flag
ac_c_werror_flag=yes
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
pgac_cv_prog_CC_cflags__mpopcnt=yes
else
pgac_cv_prog_CC_cflags__mpopcnt=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
ac_c_werror_flag=$ac_save_c_werror_flag
CFLAGS="$pgac_save_CFLAGS"
CC="$pgac_save_CC"
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CC_cflags__mpopcnt" >&5
$as_echo "$pgac_cv_prog_CC_cflags__mpopcnt" >&6; }
if test x"$pgac_cv_prog_CC_cflags__mpopcnt" = x"yes"; then
CFLAGS_POPCNT="${CFLAGS_POPCNT} -mpopcnt"
fi
CFLAGS_VECTOR=$CFLAGS_VECTOR CFLAGS_VECTOR=$CFLAGS_VECTOR
...@@ -14111,32 +14154,28 @@ $as_echo "#define HAVE__BUILTIN_CTZ 1" >>confdefs.h ...@@ -14111,32 +14154,28 @@ $as_echo "#define HAVE__BUILTIN_CTZ 1" >>confdefs.h
fi fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_popcount" >&5 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_popcount" >&5
$as_echo_n "checking for __builtin_popcount... " >&6; } $as_echo_n "checking for __builtin_popcount... " >&6; }
if ${pgac_cv_popcount+:} false; then : if ${pgac_cv__builtin_popcount+:} false; then :
$as_echo_n "(cached) " >&6 $as_echo_n "(cached) " >&6
else else
pgac_save_CFLAGS=$CFLAGS cat confdefs.h - <<_ACEOF >conftest.$ac_ext
CFLAGS="$pgac_save_CFLAGS -mpopcnt"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */ /* end confdefs.h. */
static int x = __builtin_popcount(255); static int x = __builtin_popcount(255);
_ACEOF _ACEOF
if ac_fn_c_try_compile "$LINENO"; then : if ac_fn_c_try_compile "$LINENO"; then :
pgac_cv_popcount=yes pgac_cv__builtin_popcount=yes
else else
pgac_cv_popcount=no pgac_cv__builtin_popcount=no
fi fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$pgac_save_CFLAGS"
fi fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_popcount" >&5 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__builtin_popcount" >&5
$as_echo "$pgac_cv_popcount" >&6; } $as_echo "$pgac_cv__builtin_popcount" >&6; }
if test x"$pgac_cv_popcount" = x"yes"; then if test x"$pgac_cv__builtin_popcount" = x"yes"; then
CFLAGS_POPCNT="-mpopcnt"
$as_echo "#define HAVE__BUILTIN_POPCOUNT 1" >>confdefs.h $as_echo "#define HAVE__BUILTIN_POPCOUNT 1" >>confdefs.h
fi fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_unreachable" >&5 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_unreachable" >&5
$as_echo_n "checking for __builtin_unreachable... " >&6; } $as_echo_n "checking for __builtin_unreachable... " >&6; }
if ${pgac_cv__builtin_unreachable+:} false; then : if ${pgac_cv__builtin_unreachable+:} false; then :
...@@ -14654,6 +14693,7 @@ $as_echo "#define LOCALE_T_IN_XLOCALE 1" >>confdefs.h ...@@ -14654,6 +14693,7 @@ $as_echo "#define LOCALE_T_IN_XLOCALE 1" >>confdefs.h
fi fi
have__builtin_popcount=$pgac_cv__builtin_popcount
# MSVC doesn't cope well with defining restrict to __restrict, the # MSVC doesn't cope well with defining restrict to __restrict, the
......
...@@ -547,6 +547,10 @@ elif test "$PORTNAME" = "hpux"; then ...@@ -547,6 +547,10 @@ elif test "$PORTNAME" = "hpux"; then
PGAC_PROG_CXX_CFLAGS_OPT([+Olibmerrno]) PGAC_PROG_CXX_CFLAGS_OPT([+Olibmerrno])
fi fi
# Optimization flags and options for bit-twiddling
PGAC_PROG_CC_VAR_OPT(CFLAGS_POPCNT, [-mpopcnt])
AC_SUBST(CFLAGS_POPCNT)
AC_SUBST(CFLAGS_VECTOR, $CFLAGS_VECTOR) AC_SUBST(CFLAGS_VECTOR, $CFLAGS_VECTOR)
# Determine flags used to emit bitcode for JIT inlining. Need to test # Determine flags used to emit bitcode for JIT inlining. Need to test
...@@ -1506,7 +1510,7 @@ AC_TYPE_LONG_LONG_INT ...@@ -1506,7 +1510,7 @@ AC_TYPE_LONG_LONG_INT
PGAC_TYPE_LOCALE_T PGAC_TYPE_LOCALE_T
AC_SUBST(CFLAGS_POPCNT) AC_SUBST(have__builtin_popcount, $pgac_cv__builtin_popcount)
# MSVC doesn't cope well with defining restrict to __restrict, the # MSVC doesn't cope well with defining restrict to __restrict, the
# spelling it understands, because it conflicts with # spelling it understands, because it conflicts with
......
...@@ -517,6 +517,9 @@ WIN32_STACK_RLIMIT=4194304 ...@@ -517,6 +517,9 @@ WIN32_STACK_RLIMIT=4194304
# Set if we have a working win32 crashdump header # Set if we have a working win32 crashdump header
have_win32_dbghelp = @have_win32_dbghelp@ have_win32_dbghelp = @have_win32_dbghelp@
# Set if __builtin_popcount() is supported by $(CC)
have__builtin_popcount = @have__builtin_popcount@
# Pull in platform-specific magic # Pull in platform-specific magic
include $(top_builddir)/src/Makefile.port include $(top_builddir)/src/Makefile.port
......
...@@ -10,17 +10,177 @@ ...@@ -10,17 +10,177 @@
* *
*------------------------------------------------------------------------ - *------------------------------------------------------------------------ -
*/ */
#ifndef PG_BITUTILS_H #ifndef PG_BITUTILS_H
#define PG_BITUTILS_H #define PG_BITUTILS_H
extern int (*pg_popcount32) (uint32 word); extern int (*pg_popcount32) (uint32 word);
extern int (*pg_popcount64) (uint64 word); extern int (*pg_popcount64) (uint64 word);
extern int (*pg_rightmost_one32) (uint32 word);
extern int (*pg_rightmost_one64) (uint64 word);
extern int (*pg_leftmost_one32) (uint32 word);
extern int (*pg_leftmost_one64) (uint64 word);
extern uint64 pg_popcount(const char *buf, int bytes); extern uint64 pg_popcount(const char *buf, int bytes);
/* in pg_bitutils_hwpopcnt.c */
extern int pg_popcount32_hw(uint32 word);
extern int pg_popcount64_hw(uint64 word);
#ifndef HAVE__BUILTIN_CTZ
/*
* Array marking the position of the right-most set bit for each value of
* 1-255. We count the right-most position as the 0th bit, and the
* left-most the 7th bit. The 0th index of the array must not be used.
*/
static const uint8 rightmost_one_pos[256] = {
0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
};
#endif /* !HAVE__BUILTIN_CTZ */
/*
* pg_rightmost_one32
* Returns the number of trailing 0-bits in word, starting at the least
* significant bit position. word must not be 0.
*/
static inline int
pg_rightmost_one32(uint32 word)
{
int result = 0;
Assert(word != 0);
#ifdef HAVE__BUILTIN_CTZ
result = __builtin_ctz(word);
#else
while ((word & 255) == 0)
{
word >>= 8;
result += 8;
}
result += rightmost_one_pos[word & 255];
#endif /* HAVE__BUILTIN_CTZ */
return result;
}
/*
* pg_rightmost_one64
* Returns the number of trailing 0-bits in word, starting at the least
* significant bit position. word must not be 0.
*/
static inline int
pg_rightmost_one64(uint64 word)
{
int result = 0;
Assert(word != 0);
#ifdef HAVE__BUILTIN_CTZ
#if defined(HAVE_LONG_INT_64)
return __builtin_ctzl(word);
#elif defined(HAVE_LONG_LONG_INT_64)
return __builtin_ctzll(word);
#else
#error must have a working 64-bit integer datatype
#endif
#else /* HAVE__BUILTIN_CTZ */
while ((word & 255) == 0)
{
word >>= 8;
result += 8;
}
result += rightmost_one_pos[word & 255];
#endif
return result;
}
#ifndef HAVE__BUILTIN_CLZ
/*
* Array marking the position of the left-most set bit for each value of
* 1-255. We count the right-most position as the 0th bit, and the
* left-most the 7th bit. The 0th index of the array must not be used.
*/
static const uint8 leftmost_one_pos[256] = {
0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
};
#endif /* !HAVE_BUILTIN_CLZ */
/*
* pg_leftmost_one32
* Returns the 0-based position of the most significant set bit in word
* measured from the least significant bit. word must not be 0.
*/
static inline int
pg_leftmost_one32(uint32 word)
{
#ifdef HAVE__BUILTIN_CLZ
Assert(word != 0);
return 31 - __builtin_clz(word);
#else
int shift = 32 - 8;
Assert(word != 0);
while ((word >> shift) == 0)
shift -= 8;
return shift + leftmost_one_pos[(word >> shift) & 255];
#endif /* HAVE__BUILTIN_CLZ */
}
/*
* pg_leftmost_one64
* Returns the 0-based position of the most significant set bit in word
* measured from the least significant bit. word must not be 0.
*/
static inline int
pg_leftmost_one64(uint64 word)
{
#ifdef HAVE__BUILTIN_CLZ
Assert(word != 0);
#if defined(HAVE_LONG_INT_64)
return 63 - __builtin_clzl(word);
#elif defined(HAVE_LONG_LONG_INT_64)
return 63 - __builtin_clzll(word);
#else
#error must have a working 64-bit integer datatype
#endif
#else /* HAVE__BUILTIN_CLZ */
int shift = 64 - 8;
Assert(word != 0);
while ((word >> shift) == 0)
shift -= 8;
return shift + leftmost_one_pos[(word >> shift) & 255];
#endif /* !HAVE__BUIILTIN_CLZ */
}
#endif /* PG_BITUTILS_H */ #endif /* PG_BITUTILS_H */
...@@ -41,6 +41,14 @@ OBJS = $(LIBOBJS) $(PG_CRC32C_OBJS) chklocale.o erand48.o inet_net_ntop.o \ ...@@ -41,6 +41,14 @@ OBJS = $(LIBOBJS) $(PG_CRC32C_OBJS) chklocale.o erand48.o inet_net_ntop.o \
qsort.o qsort_arg.o quotes.o snprintf.o sprompt.o strerror.o \ qsort.o qsort_arg.o quotes.o snprintf.o sprompt.o strerror.o \
tar.o thread.o tar.o thread.o
# If the compiler supports a special flag for the POPCOUNT instruction and it
# has __builtin_popcount, add pg_bitutils_hwpopcnt.o.
ifneq ($(CFLAGS_POPCNT),)
ifeq ($(have__builtin_popcount),yes)
OBJS += pg_bitutils_hwpopcnt.o
endif
endif
# libpgport.a, libpgport_shlib.a, and libpgport_srv.a contain the same files # libpgport.a, libpgport_shlib.a, and libpgport_srv.a contain the same files
# foo.o, foo_shlib.o, and foo_srv.o are all built from foo.c # foo.o, foo_shlib.o, and foo_srv.o are all built from foo.c
OBJS_SHLIB = $(OBJS:%.o=%_shlib.o) OBJS_SHLIB = $(OBJS:%.o=%_shlib.o)
...@@ -78,10 +86,10 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_ARMV8_CRC32C) ...@@ -78,10 +86,10 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_ARMV8_CRC32C)
pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_ARMV8_CRC32C) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_ARMV8_CRC32C)
pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_ARMV8_CRC32C) pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_ARMV8_CRC32C)
# pg_bitutils.c needs CFLAGS_POPCNT # all versions of pg_bitutils_hwpopcnt.c need CFLAGS_POPCNT
pg_bitutils.o: CFLAGS+=$(CFLAGS_POPCNT) pg_bitutils_hwpopcnt.o: CFLAGS+=$(CFLAGS_POPCNT)
pg_bitutils_shlib.o: CFLAGS+=$(CFLAGS_POPCNT) pg_bitutils_hwpopcnt_shlib.o: CFLAGS+=$(CFLAGS_POPCNT)
pg_bitutils_srv.o: CFLAGS+=$(CFLAGS_POPCNT) pg_bitutils_hwpopcnt_srv.o: CFLAGS+=$(CFLAGS_POPCNT)
# #
# Shared library versions of object files # Shared library versions of object files
......
...@@ -10,7 +10,6 @@ ...@@ -10,7 +10,6 @@
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
#include "postgres.h" #include "postgres.h"
#ifdef HAVE__GET_CPUID #ifdef HAVE__GET_CPUID
...@@ -23,61 +22,21 @@ ...@@ -23,61 +22,21 @@
#include "port/pg_bitutils.h" #include "port/pg_bitutils.h"
#if defined(HAVE__BUILTIN_POPCOUNT) && defined(HAVE__GET_CPUID) #ifdef HAVE__BUILTIN_POPCOUNT
static bool pg_popcount_available(void); static bool pg_popcount_available(void);
static int pg_popcount32_choose(uint32 word); static int pg_popcount32_choose(uint32 word);
static int pg_popcount32_sse42(uint32 word); static int pg_popcount32_builtin(uint32 word);
static int pg_popcount64_choose(uint64 word); static int pg_popcount64_choose(uint64 word);
static int pg_popcount64_sse42(uint64 word); static int pg_popcount64_builtin(uint64 word);
#endif int (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
static int pg_popcount32_slow(uint32 word); int (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
static int pg_popcount64_slow(uint64 word);
#if defined(HAVE__GET_CPUID) && (defined(HAVE__BUILTIN_CTZ) || defined(HAVE__BUILTIN_CLZ))
static bool pg_lzcnt_available(void);
#endif
#if defined(HAVE__BUILTIN_CTZ) && defined(HAVE__GET_CPUID)
static int pg_rightmost_one32_choose(uint32 word);
static int pg_rightmost_one32_abm(uint32 word);
static int pg_rightmost_one64_choose(uint64 word);
static int pg_rightmost_one64_abm(uint64 word);
#endif
static int pg_rightmost_one32_slow(uint32 word);
static int pg_rightmost_one64_slow(uint64 word);
#if defined(HAVE__BUILTIN_CLZ) && defined(HAVE__GET_CPUID)
static int pg_leftmost_one32_choose(uint32 word);
static int pg_leftmost_one32_abm(uint32 word);
static int pg_leftmost_one64_choose(uint64 word);
static int pg_leftmost_one64_abm(uint64 word);
#endif
static int pg_leftmost_one32_slow(uint32 word);
static int pg_leftmost_one64_slow(uint64 word);
#if defined(HAVE__BUILTIN_POPCOUNT) && defined(HAVE__GET_CPUID)
int (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
int (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
#else
int (*pg_popcount32) (uint32 word) = pg_popcount32_slow;
int (*pg_popcount64) (uint64 word) = pg_popcount64_slow;
#endif
#if defined(HAVE__BUILTIN_CTZ) && defined(HAVE__GET_CPUID)
int (*pg_rightmost_one32) (uint32 word) = pg_rightmost_one32_choose;
int (*pg_rightmost_one64) (uint64 word) = pg_rightmost_one64_choose;
#else #else
int (*pg_rightmost_one32) (uint32 word) = pg_rightmost_one32_slow; static int pg_popcount32_slow(uint32 word);
int (*pg_rightmost_one64) (uint64 word) = pg_rightmost_one64_slow; static int pg_popcount64_slow(uint64 word);
#endif int (*pg_popcount32) (uint32 word) = pg_popcount32_slow;
int (*pg_popcount64) (uint64 word) = pg_popcount64_slow;
#endif /* !HAVE_BUILTIN_POPCOUNT */
#if defined(HAVE__BUILTIN_CLZ) && defined(HAVE__GET_CPUID)
int (*pg_leftmost_one32) (uint32 word) = pg_leftmost_one32_choose;
int (*pg_leftmost_one64) (uint64 word) = pg_leftmost_one64_choose;
#else
int (*pg_leftmost_one32) (uint32 word) = pg_leftmost_one32_slow;
int (*pg_leftmost_one64) (uint64 word) = pg_leftmost_one64_slow;
#endif
/* Array marking the number of 1-bits for each value of 0-255. */ /* Array marking the number of 1-bits for each value of 0-255. */
static const uint8 number_of_ones[256] = { static const uint8 number_of_ones[256] = {
...@@ -100,96 +59,51 @@ static const uint8 number_of_ones[256] = { ...@@ -100,96 +59,51 @@ static const uint8 number_of_ones[256] = {
}; };
/* /*
* Array marking the position of the right-most set bit for each value of * Return true iff we have CPUID support and it indicates that the POPCNT
* 1-255. We count the right-most position as the 0th bit, and the * instruction is available.
* left-most the 7th bit. The 0th index of the array must not be used.
*/ */
static const uint8 rightmost_one_pos[256] = {
0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
};
/*
* Array marking the position of the left-most set bit for each value of
* 1-255. We count the right-most position as the 0th bit, and the
* left-most the 7th bit. The 0th index of the array must not be used.
*/
static const uint8 leftmost_one_pos[256] = {
0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
};
#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_POPCOUNT)
static bool static bool
pg_popcount_available(void) pg_popcount_available(void)
{ {
unsigned int exx[4] = { 0, 0, 0, 0 }; #if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
unsigned int exx[4] = {0, 0, 0, 0};
#if defined(HAVE__GET_CPUID) #if defined(HAVE__GET_CPUID)
__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
#elif defined(HAVE__CPUID) #elif defined(HAVE__CPUID)
__cpuid(exx, 1); __cpuid(exx, 1);
#else
#error cpuid instruction not available
#endif #endif
return (exx[2] & (1 << 23)) != 0; /* POPCNT */ return (exx[2] & (1 << 23)) != 0; /* POPCNT */
} #else /* HAVE__GET_CPUID || HAVE__CPUID */
#endif
#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_POPCOUNT) return false;
#endif
}
#ifdef HAVE__BUILTIN_POPCOUNT
/* /*
* This gets called on the first call. It replaces the function pointer * This gets called on the first call to pg_popcount32. It replaces the
* so that subsequent calls are routed directly to the chosen implementation. * function pointer so that subsequent calls are routed directly to the chosen
* implementation.
*/ */
static int static int
pg_popcount32_choose(uint32 word) pg_popcount32_choose(uint32 word)
{ {
if (pg_popcount_available()) if (pg_popcount_available())
pg_popcount32 = pg_popcount32_sse42; pg_popcount32 = pg_popcount32_hw;
else else
pg_popcount32 = pg_popcount32_slow; pg_popcount32 = pg_popcount32_builtin;
return pg_popcount32(word); return pg_popcount32(word);
} }
static int static int
pg_popcount32_sse42(uint32 word) pg_popcount32_builtin(uint32 word)
{ {
return __builtin_popcount(word); return __builtin_popcount(word);
} }
#endif #else /* HAVE__BUILTIN_POPCOUNT */
/* /*
* pg_popcount32_slow * pg_popcount32_slow
* Return the number of 1 bits set in word * Return the number of 1 bits set in word
...@@ -197,7 +111,7 @@ pg_popcount32_sse42(uint32 word) ...@@ -197,7 +111,7 @@ pg_popcount32_sse42(uint32 word)
static int static int
pg_popcount32_slow(uint32 word) pg_popcount32_slow(uint32 word)
{ {
int result = 0; int result = 0;
while (word != 0) while (word != 0)
{ {
...@@ -207,6 +121,7 @@ pg_popcount32_slow(uint32 word) ...@@ -207,6 +121,7 @@ pg_popcount32_slow(uint32 word)
return result; return result;
} }
#endif
/* /*
* pg_popcount * pg_popcount
...@@ -215,13 +130,13 @@ pg_popcount32_slow(uint32 word) ...@@ -215,13 +130,13 @@ pg_popcount32_slow(uint32 word)
uint64 uint64
pg_popcount(const char *buf, int bytes) pg_popcount(const char *buf, int bytes)
{ {
uint64 popcnt = 0; uint64 popcnt = 0;
#if SIZEOF_VOID_P >= 8 #if SIZEOF_VOID_P >= 8
/* Process in 64-bit chunks if the buffer is aligned. */ /* Process in 64-bit chunks if the buffer is aligned. */
if (buf == (char *) TYPEALIGN(8, buf)) if (buf == (char *) TYPEALIGN(8, buf))
{ {
uint64 *words = (uint64 *) buf; uint64 *words = (uint64 *) buf;
while (bytes >= 8) while (bytes >= 8)
{ {
...@@ -235,7 +150,7 @@ pg_popcount(const char *buf, int bytes) ...@@ -235,7 +150,7 @@ pg_popcount(const char *buf, int bytes)
/* Process in 32-bit chunks if the buffer is aligned. */ /* Process in 32-bit chunks if the buffer is aligned. */
if (buf == (char *) TYPEALIGN(4, buf)) if (buf == (char *) TYPEALIGN(4, buf))
{ {
uint32 *words = (uint32 *) buf; uint32 *words = (uint32 *) buf;
while (bytes >= 4) while (bytes >= 4)
{ {
...@@ -254,38 +169,36 @@ pg_popcount(const char *buf, int bytes) ...@@ -254,38 +169,36 @@ pg_popcount(const char *buf, int bytes)
return popcnt; return popcnt;
} }
#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_POPCOUNT) #ifdef HAVE__BUILTIN_POPCOUNT
/* /*
* This gets called on the first call. It replaces the function pointer * This gets called on the first call to pg_popcount64. It replaces the
* so that subsequent calls are routed directly to the chosen implementation. * function pointer so that subsequent calls are routed directly to the chosen
* implementation.
*/ */
static int static int
pg_popcount64_choose(uint64 word) pg_popcount64_choose(uint64 word)
{ {
if (pg_popcount_available()) if (pg_popcount_available())
pg_popcount64 = pg_popcount64_sse42; pg_popcount64 = pg_popcount64_hw;
else else
pg_popcount64 = pg_popcount64_slow; pg_popcount64 = pg_popcount64_builtin;
return pg_popcount64(word); return pg_popcount64(word);
} }
static int static int
pg_popcount64_sse42(uint64 word) pg_popcount64_builtin(uint64 word)
{ {
#if defined(HAVE_LONG_INT_64) #if defined(HAVE_LONG_INT_64)
return __builtin_popcountl(word); return __builtin_popcountl(word);
#elif defined(HAVE_LONG_LONG_INT_64) #elif defined(HAVE_LONG_LONG_INT_64)
return __builtin_popcountll(word); return __builtin_popcountll(word);
#else #else
/* shouldn't happen */
#error must have a working 64-bit integer datatype #error must have a working 64-bit integer datatype
#endif #endif
} }
#endif #else /* HAVE__BUILTIN_POPCOUNT */
/* /*
* pg_popcount64_slow * pg_popcount64_slow
* Return the number of 1 bits set in word * Return the number of 1 bits set in word
...@@ -293,7 +206,7 @@ pg_popcount64_sse42(uint64 word) ...@@ -293,7 +206,7 @@ pg_popcount64_sse42(uint64 word)
static int static int
pg_popcount64_slow(uint64 word) pg_popcount64_slow(uint64 word)
{ {
int result = 0; int result = 0;
while (word != 0) while (word != 0)
{ {
...@@ -303,211 +216,4 @@ pg_popcount64_slow(uint64 word) ...@@ -303,211 +216,4 @@ pg_popcount64_slow(uint64 word)
return result; return result;
} }
#if defined(HAVE__GET_CPUID) && (defined(HAVE__BUILTIN_CTZ) || defined(HAVE__BUILTIN_CLZ))
static bool
pg_lzcnt_available(void)
{
unsigned int exx[4] = { 0, 0, 0, 0 };
#if defined(HAVE__GET_CPUID)
__get_cpuid(0x80000001, &exx[0], &exx[1], &exx[2], &exx[3]);
#elif defined(HAVE__CPUID)
__cpuid(exx, 0x80000001);
#else
#error cpuid instruction not available
#endif
return (exx[2] & (1 << 5)) != 0; /* LZCNT */
}
#endif
#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_CTZ)
/*
* This gets called on the first call. It replaces the function pointer
* so that subsequent calls are routed directly to the chosen implementation.
*/
static int
pg_rightmost_one32_choose(uint32 word)
{
if (pg_lzcnt_available())
pg_rightmost_one32 = pg_rightmost_one32_abm;
else
pg_rightmost_one32 = pg_rightmost_one32_slow;
return pg_rightmost_one32(word);
}
static int
pg_rightmost_one32_abm(uint32 word)
{
return __builtin_ctz(word);
}
#endif #endif
/*
* pg_rightmost_one32_slow
* Returns the number of trailing 0-bits in word, starting at the least
* significant bit position. word must not be 0.
*/
static int
pg_rightmost_one32_slow(uint32 word)
{
int result = 0;
Assert(word != 0);
while ((word & 255) == 0)
{
word >>= 8;
result += 8;
}
result += rightmost_one_pos[word & 255];
return result;
}
#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_CTZ)
/*
* This gets called on the first call. It replaces the function pointer
* so that subsequent calls are routed directly to the chosen implementation.
*/
static int
pg_rightmost_one64_choose(uint64 word)
{
if (pg_lzcnt_available())
pg_rightmost_one64 = pg_rightmost_one64_abm;
else
pg_rightmost_one64 = pg_rightmost_one64_slow;
return pg_rightmost_one64(word);
}
static int
pg_rightmost_one64_abm(uint64 word)
{
#if defined(HAVE_LONG_INT_64)
return __builtin_ctzl(word);
#elif defined(HAVE_LONG_LONG_INT_64)
return __builtin_ctzll(word);
#else
/* shouldn't happen */
#error must have a working 64-bit integer datatype
#endif
}
#endif
/*
* pg_rightmost_one64_slow
* Returns the number of trailing 0-bits in word, starting at the least
* significant bit position. word must not be 0.
*/
static int
pg_rightmost_one64_slow(uint64 word)
{
int result = 0;
Assert(word != 0);
while ((word & 255) == 0)
{
word >>= 8;
result += 8;
}
result += rightmost_one_pos[word & 255];
return result;
}
#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_CLZ)
/*
* This gets called on the first call. It replaces the function pointer
* so that subsequent calls are routed directly to the chosen implementation.
*/
static int
pg_leftmost_one32_choose(uint32 word)
{
if (pg_lzcnt_available())
pg_leftmost_one32 = pg_leftmost_one32_abm;
else
pg_leftmost_one32 = pg_leftmost_one32_slow;
return pg_leftmost_one32(word);
}
static int
pg_leftmost_one32_abm(uint32 word)
{
return 31 - __builtin_clz(word);
}
#endif
/*
* pg_leftmost_one32_slow
* Returns the 0-based position of the most significant set bit in word
* measured from the least significant bit. word must not be 0.
*/
static int
pg_leftmost_one32_slow(uint32 word)
{
int shift = 32 - 8;
Assert(word != 0);
while ((word >> shift) == 0)
shift -= 8;
return shift + leftmost_one_pos[(word >> shift) & 255];
}
#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_CLZ)
/*
* This gets called on the first call. It replaces the function pointer
* so that subsequent calls are routed directly to the chosen implementation.
*/
static int
pg_leftmost_one64_choose(uint64 word)
{
if (pg_lzcnt_available())
pg_leftmost_one64 = pg_leftmost_one64_abm;
else
pg_leftmost_one64 = pg_leftmost_one64_slow;
return pg_leftmost_one64(word);
}
static int
pg_leftmost_one64_abm(uint64 word)
{
#if defined(HAVE_LONG_INT_64)
return 63 - __builtin_clzl(word);
#elif defined(HAVE_LONG_LONG_INT_64)
return 63 - __builtin_clzll(word);
#else
/* shouldn't happen */
#error must have a working 64-bit integer datatype
#endif
}
#endif
/*
* pg_leftmost_one64_slow
* Returns the 0-based position of the most significant set bit in word
* measured from the least significant bit. word must not be 0.
*/
static int
pg_leftmost_one64_slow(uint64 word)
{
int shift = 64 - 8;
Assert(word != 0);
while ((word >> shift) == 0)
shift -= 8;
return shift + leftmost_one_pos[(word >> shift) & 255];
}
/*-------------------------------------------------------------------------
*
* pg_bitutils_hwpopcnt.c
* CPU-optimized implementation of pg_popcount variants
*
* This file must be compiled with a compiler-specific flag to enable the
* POPCNT instruction.
*
* Portions Copyright (c) 2019, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/port/pg_bitutils_hwpopcnt.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "port/pg_bitutils.h"
int
pg_popcount32_hw(uint32 word)
{
return __builtin_popcount(word);
}
int
pg_popcount64_hw(uint64 word)
{
#if defined(HAVE_LONG_INT_64)
return __builtin_popcountl(word);
#elif defined(HAVE_LONG_LONG_INT_64)
return __builtin_popcountll(word);
#else
#error must have a working 64-bit integer datatype
#endif
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment