Commit 1db943b3 authored by Marc G. Fournier's avatar Marc G. Fournier

commit Oleg and Teodor's RD-tree implementation ... this provides the
regression tests for the GiST changes ... this should be integrated into
the regular regression tests similar to Vadim's SPI contrib stuff ...
parent 0ad7db4b
subdir = contrib/intarray
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
# override libdir to install shlib in contrib not main directory
libdir := $(libdir)/contrib
# shared library parameters
NAME= _int
SO_MAJOR_VERSION= 1
SO_MINOR_VERSION= 0
override CPPFLAGS += -I$(srcdir) -DPGSQL71
OBJS= _int.o
all: all-lib $(NAME).sql
# Shared library stuff
include $(top_srcdir)/src/Makefile.shlib
$(NAME).sql: $(NAME).sql.in
sed -e 's:MODULE_PATHNAME:$(libdir)/$(shlib):g' < $< > $@
.PHONY: submake
submake:
$(MAKE) -C $(top_builddir)/src/test/regress pg_regress
# against installed postmaster
installcheck: submake
@echo "'make installcheck' is not supported."
installcheck: submake
$(top_builddir)/src/test/regress/pg_regress _int
# in-tree test doesn't work yet (no way to install my shared library)
#check: all submake
# $(top_builddir)/src/test/regress/pg_regress --temp-install \
# --top-builddir=$(top_builddir) _int
check:
@echo "'make check' is not supported."
@echo "Do 'make install', then 'make installcheck' instead."
install: all installdirs install-lib
#$(INSTALL_DATA) $(srcdir)/README.$(NAME) $(docdir)/contrib
$(INSTALL_DATA) $(NAME).sql $(datadir)/contrib
installdirs:
$(mkinstalldirs) $(docdir)/contrib $(datadir)/contrib $(libdir)
uninstall: uninstall-lib
rm -f $(docdir)/contrib/README.$(NAME) $(datadir)/contrib/$(NAME).sql
clean distclean maintainer-clean: clean-lib
rm -f *.so y.tab.c y.tab.h $(OBJS) $(NAME).sql
# things created by various check targets
rm -rf results tmp_check log
rm -f regression.diffs regression.out regress.out run_check.out
ifeq ($(PORTNAME), win)
rm -f regress.def
endif
depend dep:
$(CC) -MM $(CFLAGS) *.c >depend
ifeq (depend,$(wildcard depend))
include depend
endif
#-------------------------------------------------------------------------
#
# Makefile --
#
# Makefile for Enzyme Commission catalogue number type -- ec_code
#
#-------------------------------------------------------------------------
PGDIR = ../..
SRCDIR = $(PGDIR)/src
include $(SRCDIR)/Makefile.global
INCLUDE_OPT = -I ./ \
-I $(SRCDIR)/ \
-I $(SRCDIR)/include \
-I $(SRCDIR)/port/$(PORTNAME)
CFLAGS += $(INCLUDE_OPT) $(CFLAGS_SL)
MODNAME = _int
OBJFILES = $(MODNAME).o
SQLDEFS = $(MODNAME).sql
MODULE = $(MODNAME)$(DLSUFFIX)
MODDIR = $(LIBDIR)/modules
SQLDIR = $(LIBDIR)/sql
all: module sql
module: $(MODULE)
sql: $(SQLDEFS)
$(MODULE): $(OBJFILES)
$(CC) $(CFLAGS) -shared -o $@ $(OBJFILES)
install: $(MODULE) $(SQLDEFS) $(MODDIR) $(SQLDIR)
cp -p $(MODULE) $(MODDIR)/
strip $(MODDIR)/$(MODULE)
cp -p $(SQLDEFS) $(SQLDIR)/
$(MODDIR):
mkdir -p $@
$(SQLDIR):
mkdir -p $@
%.sql: %.sql.in
sed "s|MODULE_PATHNAME|$(MODDIR)/$(MODULE)|" < $< > $@
depend dep:
$(CC) -MM $(INCLUDE_OPT) *.c >depend
clean:
rm -f $(MODULE) $(SQLDEFS) *$(DLSUFFIX)
rm -f *~ *# *.b *.o *.output *.tab.h $(MODNAME)parse.h $(MODNAME)parse.c $(MODNAME)scan.c
ifeq (depend,$(wildcard depend))
include depend
endif
This is an implementation of RD-tree data structure using GiST interface
of PostgreSQL. It has built-in lossy compression - must be declared
in index creation - with (islossy). Current implementation has index support
for one-dimensional array of int4's.
All work was done by Teodor Sigaev (teodor@stack.net) and Oleg Bartunov
(oleg@sai.msu.su). See http://www.sai.msu.su/~megera/postgres/gist
for additional information.
INSTALLATION:
gmake
gmake install
-- load functions
psql <database> < _int.sql
REGRESSION TEST:
gmake installcheck
EXAMPLE USAGE:
create table message (mid int not null,sections int[]);
create table message_section_map (mid int not null,sid int not null);
-- create indices
CREATE unique index message_key on message ( mid );
CREATE unique index message_section_map_key2 on message_section_map (sid, mid );
CREATE INDEX message_rdtree_idx on message using gist ( sections ) with ( islossy );
-- select some messages with section in 1 OR 2 - OVERLAP operator
select message.mid from message where message.sections && '{1,2}';
-- select messages contains in sections 1 AND 2 - CONTAINS operator
select message.mid from message where message.sections @ '{1,2}';
-- the same, CONTAINED operator
select message.mid from message where '{1,2}' ~ message.sections;
BENCHMARK:
subdirectory bench contains benchmark suite.
cd ./bench
1. createdb TEST
2. psql TEST < ../_int.sql
3. ./create_test.pl | psql TEST
4. ./bench.pl - perl script to benchmark queries, supports OR, AND queries
with/without RD-Tree. Run script without arguments to
see availbale options.
a)test without RD-Tree (OR)
./bench.pl -d TEST -s 1,2 -v
b)test with RD-Tree
./bench.pl -d TEST -s 1,2 -v -r
BENCHMARKS:
Size of table <message>: 200000
Size of table <message_section_map>: 268538
Distribution of messages by sections:
section 0: 73899 messages
section 1: 16298 messages
section 50: 1241 messages
section 99: 705 messages
old - without RD-Tree support,
new - with RD-Tree
+----------+---------------+----------------+
|Search set|OR, time in sec|AND, time in sec|
| +-------+-------+--------+-------+
| | old | new | old | new |
+----------+-------+-------+--------+-------+
| 1| 1.427| 0.215| -| -|
+----------+-------+-------+--------+-------+
| 99| 1.029| 0.018| -| -|
+----------+-------+-------+--------+-------+
| 1,2| 1.829| 0.334| 5.654| 0.042|
+----------+-------+-------+--------+-------+
| 1,2,50,60| 2.057| 0.359| 5.044| 0.007|
+----------+-------+-------+--------+-------+
This diff is collapsed.
-- Create the user-defined type for the 1-D frloating point indervals (_int4)
--
BEGIN TRANSACTION;
--
-- External C-functions for R-tree methods
--
-- Comparison methods
CREATE FUNCTION _int_contains(_int4, _int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
INSERT INTO pg_description (objoid, description)
SELECT oid, 'contains'::text
FROM pg_proc
WHERE proname = '_int_contains'::name;
CREATE FUNCTION _int_contained(_int4, _int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
INSERT INTO pg_description (objoid, description)
SELECT oid, 'contained in'::text
FROM pg_proc
WHERE proname = '_int_contained'::name;
CREATE FUNCTION _int_overlap(_int4, _int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
INSERT INTO pg_description (objoid, description)
SELECT oid, 'overlaps'::text
FROM pg_proc
WHERE proname = '_int_overlap'::name;
CREATE FUNCTION _int_same(_int4, _int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
INSERT INTO pg_description (objoid, description)
SELECT oid, 'same as'::text
FROM pg_proc
WHERE proname = '_int_same'::name;
CREATE FUNCTION _int_different(_int4, _int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
INSERT INTO pg_description (objoid, description)
SELECT oid, 'different'::text
FROM pg_proc
WHERE proname = '_int_different'::name;
-- support routines for indexing
CREATE FUNCTION _int_union(_int4, _int4) RETURNS _int4
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION _int_inter(_int4, _int4) RETURNS _int4
AS 'MODULE_PATHNAME' LANGUAGE 'c';
--
-- OPERATORS
--
CREATE OPERATOR && (
LEFTARG = _int4, RIGHTARG = _int4, PROCEDURE = _int_overlap,
COMMUTATOR = '&&',
RESTRICT = contsel, JOIN = contjoinsel
);
--CREATE OPERATOR = (
-- LEFTARG = _int4, RIGHTARG = _int4, PROCEDURE = _int_same,
-- COMMUTATOR = '=', NEGATOR = '<>',
-- RESTRICT = eqsel, JOIN = eqjoinsel,
-- SORT1 = '<', SORT2 = '<'
--);
CREATE OPERATOR <> (
LEFTARG = _int4, RIGHTARG = _int4, PROCEDURE = _int_different,
COMMUTATOR = '<>', NEGATOR = '=',
RESTRICT = neqsel, JOIN = neqjoinsel
);
CREATE OPERATOR @ (
LEFTARG = _int4, RIGHTARG = _int4, PROCEDURE = _int_contains,
COMMUTATOR = '~', RESTRICT = contsel, JOIN = contjoinsel
);
CREATE OPERATOR ~ (
LEFTARG = _int4, RIGHTARG = _int4, PROCEDURE = _int_contained,
COMMUTATOR = '@', RESTRICT = contsel, JOIN = contjoinsel
);
-- define the GiST support methods
CREATE FUNCTION g_int_consistent(opaque,_int4,int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_compress(opaque) RETURNS opaque
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_decompress(opaque) RETURNS opaque
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_penalty(opaque,opaque,opaque) RETURNS opaque
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_picksplit(opaque, opaque) RETURNS opaque
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_union(bytea, opaque) RETURNS _int4
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_same(_int4, _int4, opaque) RETURNS opaque
AS 'MODULE_PATHNAME' LANGUAGE 'c';
-- register the default opclass for indexing
INSERT INTO pg_opclass (opcname, opcdeftype)
SELECT 'gist__int_ops', oid
FROM pg_type
WHERE typname = '_int4';
-- get the comparators for _intments and store them in a tmp table
SELECT o.oid AS opoid, o.oprname
INTO TABLE _int_ops_tmp
FROM pg_operator o, pg_type t
WHERE o.oprleft = t.oid and o.oprright = t.oid
and t.typname = '_int4';
-- make sure we have the right operators
-- SELECT * from _int_ops_tmp;
-- using the tmp table, generate the amop entries
-- _int_overlap
INSERT INTO pg_amop (amopid, amopclaid, amopopr, amopstrategy)
SELECT am.oid, opcl.oid, c.opoid, 3
FROM pg_am am, pg_opclass opcl, _int_ops_tmp c
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and c.oprname = '&&';
-- _int_same
INSERT INTO pg_amop (amopid, amopclaid, amopopr, amopstrategy)
SELECT am.oid, opcl.oid, c.opoid, 6
FROM pg_am am, pg_opclass opcl, _int_ops_tmp c
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and c.oprname = '=';
-- _int_contains
INSERT INTO pg_amop (amopid, amopclaid, amopopr, amopstrategy)
SELECT am.oid, opcl.oid, c.opoid, 7
FROM pg_am am, pg_opclass opcl, _int_ops_tmp c
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and c.oprname = '@';
-- _int_contained
INSERT INTO pg_amop (amopid, amopclaid, amopopr, amopstrategy)
SELECT am.oid, opcl.oid, c.opoid, 8
FROM pg_am am, pg_opclass opcl, _int_ops_tmp c
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and c.oprname = '~';
DROP TABLE _int_ops_tmp;
-- add the entries to amproc for the support methods
-- note the amprocnum numbers associated with each are specific!
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 1
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_consistent';
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 2
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_union';
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 3
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_compress';
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 4
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_decompress';
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 5
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_penalty';
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 6
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_picksplit';
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 7
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_same';
END TRANSACTION;
#!/usr/bin/perl
use strict;
# make sure we are in a sane environment.
use DBI();
use DBD::Pg();
use Time::HiRes qw( usleep ualarm gettimeofday tv_interval );
use Getopt::Std;
my %opt;
getopts('d:b:s:veorauc', \%opt);
if ( !( scalar %opt && defined $opt{s} ) ) {
print <<EOT;
Usage:
$0 -d DATABASE -s SECTIONS [-b NUMBER] [-v] [-e] [-o] [-r] [-a] [-u]
-d DATABASE -DATABASE
-b NUMBER -number of repeats
-s SECTIONS -sections, format sid1[,sid2[,sid3[...]]]]
-v -verbose (show SQL)
-e -show explain
-r -use RD-tree index
-a -AND section
-o -show output
-u -unique
-c -count
EOT
exit;
}
$opt{d} ||= '_int4';
my $dbi=DBI->connect('DBI:Pg:dbname='.$opt{d});
my %table;
my @where;
$table{message}=1;
if ( $opt{a} ) {
if ( $opt{r} ) {
push @where, "message.sections @ '{$opt{s}}'";
} else {
foreach my $sid ( split(/[,\s]+/, $opt{s} )) {
push @where, "EXISTS ( select message_section_map.mid from message_section_map where message.mid=message_section_map.mid and message_section_map.sid = $sid )";
}
}
} else {
if ( $opt{r} ) {
push @where, "message.sections && '{$opt{s}}'";
} else {
$table{message_section_map} = 1;
push @where, "message.mid = message_section_map.mid";
push @where, "message_section_map.sid in ($opt{s})";
}
}
my $outf;
if ( $opt{c} ) {
$outf = ( $opt{u} ) ? 'count( distinct message.mid )' : 'count( message.mid )';
} else {
$outf = ( $opt{u} ) ? 'distinct( message.mid )' : 'message.mid';
}
my $sql = "select $outf from ".join(', ', keys %table)." where ".join(' AND ', @where).';';
if ( $opt{v} ) {
print "$sql\n";
}
if ( $opt{e} ) {
$dbi->do("explain $sql");
}
my $t0 = [gettimeofday];
my $count=0;
my $b=$opt{b};
$b||=1;
my @a;
foreach ( 1..$b ) {
@a=exec_sql($dbi,$sql);
$count=$#a;
}
my $elapsed = tv_interval ( $t0, [gettimeofday]);
if ( $opt{o} ) {
foreach ( @a ) {
print "$_->{mid}\t$_->{sections}\n";
}
}
print sprintf("total: %.02f sec; number: %d; for one: %.03f sec; found %d docs\n", $elapsed, $b, $elapsed/$b, $count+1 );
$dbi -> disconnect;
sub exec_sql {
my ($dbi, $sql, @keys) = @_;
my $sth=$dbi->prepare($sql) || die;
$sth->execute( @keys ) || die;
my $r;
my @row;
while ( defined ( $r=$sth->fetchrow_hashref ) ) {
push @row, $r;
}
$sth->finish;
return @row;
}
#!/usr/bin/perl
use strict;
print <<EOT;
create table message (
mid int not null,
sections int[]
);
create table message_section_map (
mid int not null,
sid int not null
);
EOT
open(MSG,">message.tmp") || die;
open(MAP,">message_section_map.tmp") || die;
srand( 1 );
#foreach my $i ( 1..1778 ) {
#foreach my $i ( 1..3443 ) {
#foreach my $i ( 1..5000 ) {
#foreach my $i ( 1..29362 ) {
#foreach my $i ( 1..33331 ) {
#foreach my $i ( 1..83268 ) {
foreach my $i ( 1..200000 ) {
my @sect;
if ( rand() < 0.7 ) {
$sect[0] = int( (rand()**4)*100 );
} else {
my %hash;
@sect = grep { $hash{$_}++; $hash{$_} <= 1 } map { int( (rand()**4)*100) } 0..( int(rand()*5) );
}
if ( $#sect < 0 || rand() < 0.1 ) {
print MSG "$i\t\\N\n";
} else {
print MSG "$i\t{".join(',',@sect)."}\n";
map { print MAP "$i\t$_\n" } @sect;
}
}
close MAP;
close MSG;
copytable('message');
copytable('message_section_map');
print <<EOT;
CREATE unique index message_key on message ( mid );
--CREATE unique index message_section_map_key1 on message_section_map ( mid, sid );
CREATE unique index message_section_map_key2 on message_section_map ( sid, mid );
CREATE INDEX message_rdtree_idx on message using gist ( sections ) with ( islossy );
VACUUM ANALYZE;
select count(*) from message;
select count(*) from message_section_map;
EOT
unlink 'message.tmp', 'message_section_map.tmp';
sub copytable {
my $t = shift;
print "COPY $t from stdin;\n";
open( FFF, "$t.tmp") || die;
while(<FFF>) { print; }
close FFF;
print "\\.\n";
}
This diff is collapsed.
--
-- first, define the datatype. Turn off echoing so that expected file
-- does not depend on contents of seg.sql.
--
\set ECHO none
CREATE TABLE test__int( a int[] );
\copy test__int from 'data/test__int.data'
SELECT count(*) from test__int WHERE a && '{23,50}';
count
-------
345
(1 row)
SELECT count(*) from test__int WHERE a @ '{23,50}';
count
-------
12
(1 row)
--
-- first, define the datatype. Turn off echoing so that expected file
-- does not depend on contents of seg.sql.
--
\set ECHO none
\i _int.sql
\set ECHO all
CREATE TABLE test__int( a int[] );
\copy test__int from 'data/test__int.data'
SELECT count(*) from test__int WHERE a && '{23,50}';
SELECT count(*) from test__int WHERE a @ '{23,50}';
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment