Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
Postgres FD Implementation
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Abuhujair Javed
Postgres FD Implementation
Commits
73d1040b
Commit
73d1040b
authored
May 27, 2001
by
Tom Lane
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fix eqjoinsel() to make use of new statistics.
parent
a001f135
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
201 additions
and
41 deletions
+201
-41
src/backend/utils/adt/selfuncs.c
src/backend/utils/adt/selfuncs.c
+201
-41
No files found.
src/backend/utils/adt/selfuncs.c
View file @
73d1040b
...
@@ -15,7 +15,7 @@
...
@@ -15,7 +15,7 @@
*
*
*
*
* IDENTIFICATION
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.9
0 2001/05/20 20:28:19
tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.9
1 2001/05/27 17:37:48
tgl Exp $
*
*
*-------------------------------------------------------------------------
*-------------------------------------------------------------------------
*/
*/
...
@@ -940,9 +940,7 @@ Datum
...
@@ -940,9 +940,7 @@ Datum
eqjoinsel
(
PG_FUNCTION_ARGS
)
eqjoinsel
(
PG_FUNCTION_ARGS
)
{
{
Query
*
root
=
(
Query
*
)
PG_GETARG_POINTER
(
0
);
Query
*
root
=
(
Query
*
)
PG_GETARG_POINTER
(
0
);
#ifdef NOT_USED
/* see neqjoinsel() before removing me! */
Oid
operator
=
PG_GETARG_OID
(
1
);
Oid
operator
=
PG_GETARG_OID
(
1
);
#endif
List
*
args
=
(
List
*
)
PG_GETARG_POINTER
(
2
);
List
*
args
=
(
List
*
)
PG_GETARG_POINTER
(
2
);
Var
*
var1
;
Var
*
var1
;
Var
*
var2
;
Var
*
var2
;
...
@@ -958,73 +956,219 @@ eqjoinsel(PG_FUNCTION_ARGS)
...
@@ -958,73 +956,219 @@ eqjoinsel(PG_FUNCTION_ARGS)
HeapTuple
statsTuple2
=
NULL
;
HeapTuple
statsTuple2
=
NULL
;
Form_pg_statistic
stats1
=
NULL
;
Form_pg_statistic
stats1
=
NULL
;
Form_pg_statistic
stats2
=
NULL
;
Form_pg_statistic
stats2
=
NULL
;
double
nd1
,
double
nd1
=
DEFAULT_NUM_DISTINCT
;
nd2
;
double
nd2
=
DEFAULT_NUM_DISTINCT
;
bool
have_mcvs1
=
false
;
if
(
var1
==
NULL
)
Datum
*
values1
=
NULL
;
{
int
nvalues1
=
0
;
nd1
=
DEFAULT_NUM_DISTINCT
;
float4
*
numbers1
=
NULL
;
}
int
nnumbers1
=
0
;
else
bool
have_mcvs2
=
false
;
Datum
*
values2
=
NULL
;
int
nvalues2
=
0
;
float4
*
numbers2
=
NULL
;
int
nnumbers2
=
0
;
if
(
var1
!=
NULL
)
{
{
/* get stats for the attribute, if available */
/* get stats for the attribute, if available */
Oid
relid1
=
getrelid
(
var1
->
varno
,
root
->
rtable
);
Oid
relid1
=
getrelid
(
var1
->
varno
,
root
->
rtable
);
if
(
relid1
==
InvalidOid
)
if
(
relid1
!=
InvalidOid
)
nd1
=
DEFAULT_NUM_DISTINCT
;
else
{
{
statsTuple1
=
SearchSysCache
(
STATRELATT
,
statsTuple1
=
SearchSysCache
(
STATRELATT
,
ObjectIdGetDatum
(
relid1
),
ObjectIdGetDatum
(
relid1
),
Int16GetDatum
(
var1
->
varattno
),
Int16GetDatum
(
var1
->
varattno
),
0
,
0
);
0
,
0
);
if
(
HeapTupleIsValid
(
statsTuple1
))
if
(
HeapTupleIsValid
(
statsTuple1
))
{
stats1
=
(
Form_pg_statistic
)
GETSTRUCT
(
statsTuple1
);
stats1
=
(
Form_pg_statistic
)
GETSTRUCT
(
statsTuple1
);
have_mcvs1
=
get_attstatsslot
(
statsTuple1
,
var1
->
vartype
,
var1
->
vartypmod
,
STATISTIC_KIND_MCV
,
InvalidOid
,
&
values1
,
&
nvalues1
,
&
numbers1
,
&
nnumbers1
);
}
nd1
=
get_att_numdistinct
(
root
,
var1
,
stats1
);
nd1
=
get_att_numdistinct
(
root
,
var1
,
stats1
);
}
}
}
}
if
(
var2
==
NULL
)
if
(
var2
!=
NULL
)
{
nd2
=
DEFAULT_NUM_DISTINCT
;
}
else
{
{
/* get stats for the attribute, if available */
/* get stats for the attribute, if available */
Oid
relid2
=
getrelid
(
var2
->
varno
,
root
->
rtable
);
Oid
relid2
=
getrelid
(
var2
->
varno
,
root
->
rtable
);
if
(
relid2
==
InvalidOid
)
if
(
relid2
!=
InvalidOid
)
nd2
=
DEFAULT_NUM_DISTINCT
;
else
{
{
statsTuple2
=
SearchSysCache
(
STATRELATT
,
statsTuple2
=
SearchSysCache
(
STATRELATT
,
ObjectIdGetDatum
(
relid2
),
ObjectIdGetDatum
(
relid2
),
Int16GetDatum
(
var2
->
varattno
),
Int16GetDatum
(
var2
->
varattno
),
0
,
0
);
0
,
0
);
if
(
HeapTupleIsValid
(
statsTuple2
))
if
(
HeapTupleIsValid
(
statsTuple2
))
{
stats2
=
(
Form_pg_statistic
)
GETSTRUCT
(
statsTuple2
);
stats2
=
(
Form_pg_statistic
)
GETSTRUCT
(
statsTuple2
);
have_mcvs2
=
get_attstatsslot
(
statsTuple2
,
var2
->
vartype
,
var2
->
vartypmod
,
STATISTIC_KIND_MCV
,
InvalidOid
,
&
values2
,
&
nvalues2
,
&
numbers2
,
&
nnumbers2
);
}
nd2
=
get_att_numdistinct
(
root
,
var2
,
stats2
);
nd2
=
get_att_numdistinct
(
root
,
var2
,
stats2
);
}
}
}
}
/*
if
(
have_mcvs1
&&
have_mcvs2
)
* Estimate the join selectivity as 1 / sqrt(nd1*nd2)
{
* (can we produce any theory for this)?
/*
*
* We have most-common-value lists for both relations. Run
* XXX possibility to do better: if both attributes have histograms
* through the lists to see which MCVs actually join to each
* then we could determine the exact join selectivity between the
* other with the given operator. This allows us to determine
* MCV sets, and only have to assume the join behavior of the non-MCV
* the exact join selectivity for the portion of the relations
* values. This could be a big win when the MCVs cover a large part
* represented by the MCV lists. We still have to estimate for
* of the population.
* the remaining population, but in a skewed distribution this
*
* gives us a big leg up in accuracy. For motivation see the
* XXX what about nulls?
* analysis in Y. Ioannidis and S. Christodoulakis, "On the
*/
* propagation of errors in the size of join results", Technical
selec
=
1
.
0
/
sqrt
(
nd1
*
nd2
);
* Report 1018, Computer Science Dept., University of Wisconsin,
if
(
selec
>
1
.
0
)
* Madison, March 1991 (available from ftp.cs.wisc.edu).
selec
=
1
.
0
;
*/
FmgrInfo
eqproc
;
bool
*
hasmatch1
;
bool
*
hasmatch2
;
double
matchprodfreq
,
matchfreq1
,
matchfreq2
,
unmatchfreq1
,
unmatchfreq2
,
otherfreq1
,
otherfreq2
,
totalsel1
,
totalsel2
;
int
i
,
nmatches
;
fmgr_info
(
get_opcode
(
operator
),
&
eqproc
);
hasmatch1
=
(
bool
*
)
palloc
(
nvalues1
*
sizeof
(
bool
));
memset
(
hasmatch1
,
0
,
nvalues1
*
sizeof
(
bool
));
hasmatch2
=
(
bool
*
)
palloc
(
nvalues2
*
sizeof
(
bool
));
memset
(
hasmatch2
,
0
,
nvalues2
*
sizeof
(
bool
));
/*
* Note we assume that each MCV will match at most one member of
* the other MCV list. If the operator isn't really equality,
* there could be multiple matches --- but we don't look for them,
* both for speed and because the math wouldn't add up...
*/
matchprodfreq
=
0
.
0
;
nmatches
=
0
;
for
(
i
=
0
;
i
<
nvalues1
;
i
++
)
{
int
j
;
for
(
j
=
0
;
j
<
nvalues2
;
j
++
)
{
if
(
hasmatch2
[
j
])
continue
;
if
(
DatumGetBool
(
FunctionCall2
(
&
eqproc
,
values1
[
i
],
values2
[
j
])))
{
hasmatch1
[
i
]
=
hasmatch2
[
j
]
=
true
;
matchprodfreq
+=
numbers1
[
i
]
*
numbers2
[
j
];
nmatches
++
;
break
;
}
}
}
/* Sum up frequencies of matched and unmatched MCVs */
matchfreq1
=
unmatchfreq1
=
0
.
0
;
for
(
i
=
0
;
i
<
nvalues1
;
i
++
)
{
if
(
hasmatch1
[
i
])
matchfreq1
+=
numbers1
[
i
];
else
unmatchfreq1
+=
numbers1
[
i
];
}
matchfreq2
=
unmatchfreq2
=
0
.
0
;
for
(
i
=
0
;
i
<
nvalues2
;
i
++
)
{
if
(
hasmatch2
[
i
])
matchfreq2
+=
numbers2
[
i
];
else
unmatchfreq2
+=
numbers2
[
i
];
}
pfree
(
hasmatch1
);
pfree
(
hasmatch2
);
/*
* Compute total frequency of non-null values that are not in
* the MCV lists.
*/
otherfreq1
=
1
.
0
-
stats1
->
stanullfrac
-
matchfreq1
-
unmatchfreq1
;
otherfreq2
=
1
.
0
-
stats2
->
stanullfrac
-
matchfreq2
-
unmatchfreq2
;
/*
* We can estimate the total selectivity from the point of view
* of relation 1 as: the known selectivity for matched MCVs, plus
* unmatched MCVs that are assumed to match against random members
* of relation 2's non-MCV population, plus non-MCV values that
* are assumed to match against random members of relation 2's
* unmatched MCVs plus non-MCV values.
*/
totalsel1
=
matchprodfreq
;
if
(
nd2
>
nvalues2
)
totalsel1
+=
unmatchfreq1
*
otherfreq2
/
(
nd2
-
nvalues2
);
if
(
nd2
>
nmatches
)
totalsel1
+=
otherfreq1
*
(
otherfreq2
+
unmatchfreq2
)
/
(
nd2
-
nmatches
);
/* Same estimate from the point of view of relation 2. */
totalsel2
=
matchprodfreq
;
if
(
nd1
>
nvalues1
)
totalsel2
+=
unmatchfreq2
*
otherfreq1
/
(
nd1
-
nvalues1
);
if
(
nd1
>
nmatches
)
totalsel2
+=
otherfreq2
*
(
otherfreq1
+
unmatchfreq1
)
/
(
nd1
-
nmatches
);
/*
* For robustness, we average the two estimates. (Can a case
* be made for taking the min or max instead?)
*/
selec
=
(
totalsel1
+
totalsel2
)
*
0
.
5
;
}
else
{
/*
* We do not have MCV lists for both sides. Estimate the
* join selectivity as MIN(1/nd1, 1/nd2). This is plausible
* if we assume that the values are about equally distributed:
* a given tuple of rel1 will join to either 0 or N2/nd2 rows
* of rel2, so total join rows are at most N1*N2/nd2 giving
* a join selectivity of not more than 1/nd2. By the same logic
* it is not more than 1/nd1, so MIN(1/nd1, 1/nd2) is an upper
* bound. Using the MIN() means we estimate from the point of
* view of the relation with smaller nd (since the larger nd is
* determining the MIN). It is reasonable to assume that most
* tuples in this rel will have join partners, so the bound is
* probably reasonably tight and should be taken as-is.
*
* XXX Can we be smarter if we have an MCV list for just one side?
* It seems that if we assume equal distribution for the other
* side, we end up with the same answer anyway.
*/
if
(
nd1
>
nd2
)
selec
=
1
.
0
/
nd1
;
else
selec
=
1
.
0
/
nd2
;
}
if
(
have_mcvs1
)
free_attstatsslot
(
var1
->
vartype
,
values1
,
nvalues1
,
numbers1
,
nnumbers1
);
if
(
have_mcvs2
)
free_attstatsslot
(
var2
->
vartype
,
values2
,
nvalues2
,
numbers2
,
nnumbers2
);
if
(
HeapTupleIsValid
(
statsTuple1
))
if
(
HeapTupleIsValid
(
statsTuple1
))
ReleaseSysCache
(
statsTuple1
);
ReleaseSysCache
(
statsTuple1
);
if
(
HeapTupleIsValid
(
statsTuple2
))
if
(
HeapTupleIsValid
(
statsTuple2
))
...
@@ -1039,14 +1183,30 @@ eqjoinsel(PG_FUNCTION_ARGS)
...
@@ -1039,14 +1183,30 @@ eqjoinsel(PG_FUNCTION_ARGS)
Datum
Datum
neqjoinsel
(
PG_FUNCTION_ARGS
)
neqjoinsel
(
PG_FUNCTION_ARGS
)
{
{
Query
*
root
=
(
Query
*
)
PG_GETARG_POINTER
(
0
);
Oid
operator
=
PG_GETARG_OID
(
1
);
List
*
args
=
(
List
*
)
PG_GETARG_POINTER
(
2
);
Oid
eqop
;
float8
result
;
float8
result
;
/*
/*
* XXX we skip looking up the negator operator here because we know
* We want 1 - eqjoinsel() where the equality operator is the one
* eqjoinsel() won't look at it anyway. If eqjoinsel() ever does
* associated with this != operator, that is, its negator.
* look, this routine will need to look more like neqsel() does.
*/
*/
result
=
DatumGetFloat8
(
eqjoinsel
(
fcinfo
));
eqop
=
get_negator
(
operator
);
if
(
eqop
)
{
result
=
DatumGetFloat8
(
DirectFunctionCall3
(
eqjoinsel
,
PointerGetDatum
(
root
),
ObjectIdGetDatum
(
eqop
),
PointerGetDatum
(
args
)));
}
else
{
/* Use default selectivity (should we raise an error instead?) */
result
=
DEFAULT_EQ_SEL
;
}
result
=
1
.
0
-
result
;
result
=
1
.
0
-
result
;
PG_RETURN_FLOAT8
(
result
);
PG_RETURN_FLOAT8
(
result
);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment