Commit 312b1a98 authored by Tom Lane's avatar Tom Lane

Reduce the memory footprint of large pending-trigger-event lists, as per my

recent proposal.  In typical cases, we now need 12 bytes per insert or delete
event and 16 bytes per update event; previously we needed 40 bytes per
event on 32-bit hardware and 80 bytes per event on 64-bit hardware.  Even
in the worst case usage pattern with a large number of distinct triggers being
fired in one query, usage is at most 32 bytes per event.  It seems to be a
bit faster than the old code as well, due to reduction of palloc overhead.

This commit doesn't address the TODO item of allowing the event list to spill
to disk; rather it's trying to stave off the need for that.  However, it
probably makes that task a bit easier by reducing the data structure's
dependency on pointers.  It would now be practical to dump an event list to
disk by "chunks" instead of individual events.
parent 3ca5aa6c
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/commands/trigger.c,v 1.237 2008/09/01 20:42:44 tgl Exp $
* $PostgreSQL: pgsql/src/backend/commands/trigger.c,v 1.238 2008/10/24 23:42:35 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -2234,12 +2234,14 @@ ltrmark:;
* during the current transaction tree. (BEFORE triggers are fired
* immediately so we don't need any persistent state about them.) The struct
* and most of its subsidiary data are kept in TopTransactionContext; however
* the individual event records are kept in separate contexts, to make them
* easy to delete during subtransaction abort.
* the individual event records are kept in a separate sub-context. This is
* done mainly so that it's easy to tell from a memory context dump how much
* space is being eaten by trigger events.
*
* Because the list of pending events can grow large, we go to some effort
* to minimize memory consumption. We do not use the generic List mechanism
* but thread the events manually.
* Because the list of pending events can grow large, we go to some
* considerable effort to minimize per-event memory consumption. The event
* records are grouped into chunks and common data for similar events in the
* same chunk is only stored once.
*
* XXX We need to be able to save the per-event data in a file if it grows too
* large.
......@@ -2280,30 +2282,101 @@ typedef SetConstraintStateData *SetConstraintState;
/*
* Per-trigger-event data
*
* Note: ate_firing_id is meaningful when either AFTER_TRIGGER_DONE
* or AFTER_TRIGGER_IN_PROGRESS is set. It indicates which trigger firing
* cycle the trigger was or will be fired in.
* The actual per-event data, AfterTriggerEventData, includes DONE/IN_PROGRESS
* status bits and one or two tuple CTIDs. Each event record also has an
* associated AfterTriggerSharedData that is shared across all instances
* of similar events within a "chunk".
*
* We arrange not to waste storage on ate_ctid2 for non-update events.
* We could go further and not store either ctid for statement-level triggers,
* but that seems unlikely to be worth the trouble.
*
* Note: ats_firing_id is initially zero and is set to something else when
* AFTER_TRIGGER_IN_PROGRESS is set. It indicates which trigger firing
* cycle the trigger will be fired in (or was fired in, if DONE is set).
* Although this is mutable state, we can keep it in AfterTriggerSharedData
* because all instances of the same type of event in a given event list will
* be fired at the same time, if they were queued between the same firing
* cycles. So we need only ensure that ats_firing_id is zero when attaching
* a new event to an existing AfterTriggerSharedData record.
*/
typedef uint32 TriggerFlags;
#define AFTER_TRIGGER_OFFSET 0x0FFFFFFF /* must be low-order bits */
#define AFTER_TRIGGER_2CTIDS 0x10000000
#define AFTER_TRIGGER_DONE 0x20000000
#define AFTER_TRIGGER_IN_PROGRESS 0x40000000
typedef struct AfterTriggerSharedData *AfterTriggerShared;
typedef struct AfterTriggerSharedData
{
TriggerEvent ats_event; /* event type indicator, see trigger.h */
Oid ats_tgoid; /* the trigger's ID */
Oid ats_relid; /* the relation it's on */
CommandId ats_firing_id; /* ID for firing cycle */
} AfterTriggerSharedData;
typedef struct AfterTriggerEventData *AfterTriggerEvent;
typedef struct AfterTriggerEventData
{
AfterTriggerEvent ate_next; /* list link */
TriggerEvent ate_event; /* event type and status bits */
CommandId ate_firing_id; /* ID for firing cycle */
Oid ate_tgoid; /* the trigger's ID */
Oid ate_relid; /* the relation it's on */
ItemPointerData ate_oldctid; /* specific tuple(s) involved */
ItemPointerData ate_newctid;
TriggerFlags ate_flags; /* status bits and offset to shared data */
ItemPointerData ate_ctid1; /* inserted, deleted, or old updated tuple */
ItemPointerData ate_ctid2; /* new updated tuple */
} AfterTriggerEventData;
/* This struct must exactly match the one above except for not having ctid2 */
typedef struct AfterTriggerEventDataOneCtid
{
TriggerFlags ate_flags; /* status bits and offset to shared data */
ItemPointerData ate_ctid1; /* inserted, deleted, or old updated tuple */
} AfterTriggerEventDataOneCtid;
#define SizeofTriggerEvent(evt) \
(((evt)->ate_flags & AFTER_TRIGGER_2CTIDS) ? \
sizeof(AfterTriggerEventData) : sizeof(AfterTriggerEventDataOneCtid))
#define GetTriggerSharedData(evt) \
((AfterTriggerShared) ((char *) (evt) + ((evt)->ate_flags & AFTER_TRIGGER_OFFSET)))
/*
* To avoid palloc overhead, we keep trigger events in arrays in successively-
* larger chunks (a slightly more sophisticated version of an expansible
* array). The space between CHUNK_DATA_START and freeptr is occupied by
* AfterTriggerEventData records; the space between endfree and endptr is
* occupied by AfterTriggerSharedData records.
*/
typedef struct AfterTriggerEventChunk
{
struct AfterTriggerEventChunk *next; /* list link */
char *freeptr; /* start of free space in chunk */
char *endfree; /* end of free space in chunk */
char *endptr; /* end of chunk */
/* event data follows here */
} AfterTriggerEventChunk;
#define CHUNK_DATA_START(cptr) ((char *) (cptr) + MAXALIGN(sizeof(AfterTriggerEventChunk)))
/* A list of events */
typedef struct AfterTriggerEventList
{
AfterTriggerEvent head;
AfterTriggerEvent tail;
AfterTriggerEventChunk *head;
AfterTriggerEventChunk *tail;
char *tailfree; /* freeptr of tail chunk */
} AfterTriggerEventList;
/* Macros to help in iterating over a list of events */
#define for_each_chunk(cptr, evtlist) \
for (cptr = (evtlist).head; cptr != NULL; cptr = cptr->next)
#define for_each_event(eptr, cptr) \
for (eptr = (AfterTriggerEvent) CHUNK_DATA_START(cptr); \
(char *) eptr < (cptr)->freeptr; \
eptr = (AfterTriggerEvent) (((char *) eptr) + SizeofTriggerEvent(eptr)))
/* Use this if no special per-chunk processing is needed */
#define for_each_event_chunk(eptr, cptr, evtlist) \
for_each_chunk(cptr, evtlist) for_each_event(eptr, cptr)
/*
* All per-transaction data for the AFTER TRIGGERS module.
......@@ -2323,9 +2396,7 @@ typedef struct AfterTriggerEventList
* all subtransactions of the current transaction. In a subtransaction
* abort, we know that the events added by the subtransaction are at the
* end of the list, so it is relatively easy to discard them. The event
* structs themselves are stored in event_cxt if generated by the top-level
* transaction, else in per-subtransaction contexts identified by the
* entries in cxt_stack.
* list chunks themselves are stored in event_cxt.
*
* query_depth is the current depth of nested AfterTriggerBeginQuery calls
* (-1 when the stack is empty).
......@@ -2367,8 +2438,7 @@ typedef struct AfterTriggersData
int query_depth; /* current query list index */
AfterTriggerEventList *query_stack; /* events pending from each query */
int maxquerydepth; /* allocated len of above array */
MemoryContext event_cxt; /* top transaction's event context, if any */
MemoryContext *cxt_stack; /* per-subtransaction event contexts */
MemoryContext event_cxt; /* memory context for events, if any */
/* these fields are just for resetting at subtrans abort: */
......@@ -2398,13 +2468,13 @@ static SetConstraintState SetConstraintStateAddItem(SetConstraintState state,
/* ----------
* afterTriggerCheckState()
*
* Returns true if the trigger identified by tgoid is actually
* in state DEFERRED.
* Returns true if the trigger event is actually in state DEFERRED.
* ----------
*/
static bool
afterTriggerCheckState(Oid tgoid, TriggerEvent eventstate)
afterTriggerCheckState(AfterTriggerShared evtshared)
{
Oid tgoid = evtshared->ats_tgoid;
SetConstraintState state = afterTriggers->state;
int i;
......@@ -2412,7 +2482,7 @@ afterTriggerCheckState(Oid tgoid, TriggerEvent eventstate)
* For not-deferrable triggers (i.e. normal AFTER ROW triggers and
* constraints declared NOT DEFERRABLE), the state is always false.
*/
if ((eventstate & AFTER_TRIGGER_DEFERRABLE) == 0)
if ((evtshared->ats_event & AFTER_TRIGGER_DEFERRABLE) == 0)
return false;
/*
......@@ -2433,37 +2503,184 @@ afterTriggerCheckState(Oid tgoid, TriggerEvent eventstate)
/*
* Otherwise return the default state for the trigger.
*/
return ((eventstate & AFTER_TRIGGER_INITDEFERRED) != 0);
return ((evtshared->ats_event & AFTER_TRIGGER_INITDEFERRED) != 0);
}
/* ----------
* afterTriggerAddEvent()
*
* Add a new trigger event to the current query's queue.
* Add a new trigger event to the specified queue.
* The passed-in event data is copied.
* ----------
*/
static void
afterTriggerAddEvent(AfterTriggerEvent event)
afterTriggerAddEvent(AfterTriggerEventList *events,
AfterTriggerEvent event, AfterTriggerShared evtshared)
{
AfterTriggerEventList *events;
Size eventsize = SizeofTriggerEvent(event);
Size needed = eventsize + sizeof(AfterTriggerSharedData);
AfterTriggerEventChunk *chunk;
AfterTriggerShared newshared;
AfterTriggerEvent newevent;
Assert(event->ate_next == NULL);
/*
* If empty list or not enough room in the tail chunk, make a new chunk.
* We assume here that a new shared record will always be needed.
*/
chunk = events->tail;
if (chunk == NULL ||
chunk->endfree - chunk->freeptr < needed)
{
Size chunksize;
/* Must be inside a query */
Assert(afterTriggers->query_depth >= 0);
/* Create event context if we didn't already */
if (afterTriggers->event_cxt == NULL)
afterTriggers->event_cxt =
AllocSetContextCreate(TopTransactionContext,
"AfterTriggerEvents",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
events = &afterTriggers->query_stack[afterTriggers->query_depth];
if (events->tail == NULL)
/*
* Chunk size starts at 1KB and is allowed to increase up to 1MB.
* These numbers are fairly arbitrary, though there is a hard limit at
* AFTER_TRIGGER_OFFSET; else we couldn't link event records to their
* shared records using the available space in ate_flags. Another
* constraint is that if the chunk size gets too huge, the search loop
* below would get slow given a (not too common) usage pattern with
* many distinct event types in a chunk. Therefore, we double the
* preceding chunk size only if there weren't too many shared records
* in the preceding chunk; otherwise we halve it. This gives us some
* ability to adapt to the actual usage pattern of the current query
* while still having large chunk sizes in typical usage. All chunk
* sizes used should be MAXALIGN multiples, to ensure that the shared
* records will be aligned safely.
*/
#define MIN_CHUNK_SIZE 1024
#define MAX_CHUNK_SIZE (1024*1024)
#if MAX_CHUNK_SIZE > (AFTER_TRIGGER_OFFSET+1)
#error MAX_CHUNK_SIZE must not exceed AFTER_TRIGGER_OFFSET
#endif
if (chunk == NULL)
chunksize = MIN_CHUNK_SIZE;
else
{
/* first list entry */
events->head = event;
events->tail = event;
/* preceding chunk size... */
chunksize = chunk->endptr - (char *) chunk;
/* check number of shared records in preceding chunk */
if ((chunk->endptr - chunk->endfree) <=
(100 * sizeof(AfterTriggerSharedData)))
chunksize *= 2; /* okay, double it */
else
chunksize /= 2; /* too many shared records */
chunksize = Min(chunksize, MAX_CHUNK_SIZE);
}
chunk = MemoryContextAlloc(afterTriggers->event_cxt, chunksize);
chunk->next = NULL;
chunk->freeptr = CHUNK_DATA_START(chunk);
chunk->endptr = chunk->endfree = (char *) chunk + chunksize;
Assert(chunk->endfree - chunk->freeptr >= needed);
if (events->head == NULL)
events->head = chunk;
else
events->tail->next = chunk;
events->tail = chunk;
}
/*
* Try to locate a matching shared-data record already in the chunk.
* If none, make a new one.
*/
for (newshared = ((AfterTriggerShared) chunk->endptr) - 1;
(char *) newshared >= chunk->endfree;
newshared--)
{
events->tail->ate_next = event;
events->tail = event;
if (newshared->ats_tgoid == evtshared->ats_tgoid &&
newshared->ats_relid == evtshared->ats_relid &&
newshared->ats_event == evtshared->ats_event &&
newshared->ats_firing_id == 0)
break;
}
if ((char *) newshared < chunk->endfree)
{
*newshared = *evtshared;
newshared->ats_firing_id = 0; /* just to be sure */
chunk->endfree = (char *) newshared;
}
/* Insert the data */
newevent = (AfterTriggerEvent) chunk->freeptr;
memcpy(newevent, event, eventsize);
/* ... and link the new event to its shared record */
newevent->ate_flags &= ~AFTER_TRIGGER_OFFSET;
newevent->ate_flags |= (char *) newshared - (char *) newevent;
chunk->freeptr += eventsize;
events->tailfree = chunk->freeptr;
}
/* ----------
* afterTriggerFreeEventList()
*
* Free all the event storage in the given list.
* ----------
*/
static void
afterTriggerFreeEventList(AfterTriggerEventList *events)
{
AfterTriggerEventChunk *chunk;
AfterTriggerEventChunk *next_chunk;
for (chunk = events->head; chunk != NULL; chunk = next_chunk)
{
next_chunk = chunk->next;
pfree(chunk);
}
events->head = NULL;
events->tail = NULL;
events->tailfree = NULL;
}
/* ----------
* afterTriggerRestoreEventList()
*
* Restore an event list to its prior length, removing all the events
* added since it had the value old_events.
* ----------
*/
static void
afterTriggerRestoreEventList(AfterTriggerEventList *events,
const AfterTriggerEventList *old_events)
{
AfterTriggerEventChunk *chunk;
AfterTriggerEventChunk *next_chunk;
if (old_events->tail == NULL)
{
/* restoring to a completely empty state, so free everything */
afterTriggerFreeEventList(events);
}
else
{
*events = *old_events;
/* free any chunks after the last one we want to keep */
for (chunk = events->tail->next; chunk != NULL; chunk = next_chunk)
{
next_chunk = chunk->next;
pfree(chunk);
}
/* and clean up the tail chunk to be the right length */
events->tail->next = NULL;
events->tail->freeptr = events->tailfree;
/*
* We don't make any effort to remove now-unused shared data records.
* They might still be useful, anyway.
*/
}
}
......@@ -2494,13 +2711,14 @@ AfterTriggerExecute(AfterTriggerEvent event,
FmgrInfo *finfo, Instrumentation *instr,
MemoryContext per_tuple_context)
{
Oid tgoid = event->ate_tgoid;
AfterTriggerShared evtshared = GetTriggerSharedData(event);
Oid tgoid = evtshared->ats_tgoid;
TriggerData LocTriggerData;
HeapTupleData oldtuple;
HeapTupleData newtuple;
HeapTupleData tuple1;
HeapTupleData tuple2;
HeapTuple rettuple;
Buffer oldbuffer = InvalidBuffer;
Buffer newbuffer = InvalidBuffer;
Buffer buffer1 = InvalidBuffer;
Buffer buffer2 = InvalidBuffer;
int tgindx;
/*
......@@ -2526,37 +2744,36 @@ AfterTriggerExecute(AfterTriggerEvent event,
InstrStartNode(instr + tgindx);
/*
* Fetch the required OLD and NEW tuples.
* Fetch the required tuple(s).
*/
if (ItemPointerIsValid(&(event->ate_ctid1)))
{
ItemPointerCopy(&(event->ate_ctid1), &(tuple1.t_self));
if (!heap_fetch(rel, SnapshotAny, &tuple1, &buffer1, false, NULL))
elog(ERROR, "failed to fetch tuple1 for AFTER trigger");
LocTriggerData.tg_trigtuple = &tuple1;
LocTriggerData.tg_trigtuplebuf = buffer1;
}
else
{
LocTriggerData.tg_trigtuple = NULL;
LocTriggerData.tg_newtuple = NULL;
LocTriggerData.tg_trigtuplebuf = InvalidBuffer;
LocTriggerData.tg_newtuplebuf = InvalidBuffer;
if (ItemPointerIsValid(&(event->ate_oldctid)))
{
ItemPointerCopy(&(event->ate_oldctid), &(oldtuple.t_self));
if (!heap_fetch(rel, SnapshotAny, &oldtuple, &oldbuffer, false, NULL))
elog(ERROR, "failed to fetch old tuple for AFTER trigger");
LocTriggerData.tg_trigtuple = &oldtuple;
LocTriggerData.tg_trigtuplebuf = oldbuffer;
}
if (ItemPointerIsValid(&(event->ate_newctid)))
{
ItemPointerCopy(&(event->ate_newctid), &(newtuple.t_self));
if (!heap_fetch(rel, SnapshotAny, &newtuple, &newbuffer, false, NULL))
elog(ERROR, "failed to fetch new tuple for AFTER trigger");
if (LocTriggerData.tg_trigtuple != NULL)
/* don't touch ctid2 if not there */
if ((event->ate_flags & AFTER_TRIGGER_2CTIDS) &&
ItemPointerIsValid(&(event->ate_ctid2)))
{
LocTriggerData.tg_newtuple = &newtuple;
LocTriggerData.tg_newtuplebuf = newbuffer;
ItemPointerCopy(&(event->ate_ctid2), &(tuple2.t_self));
if (!heap_fetch(rel, SnapshotAny, &tuple2, &buffer2, false, NULL))
elog(ERROR, "failed to fetch tuple2 for AFTER trigger");
LocTriggerData.tg_newtuple = &tuple2;
LocTriggerData.tg_newtuplebuf = buffer2;
}
else
{
LocTriggerData.tg_trigtuple = &newtuple;
LocTriggerData.tg_trigtuplebuf = newbuffer;
}
LocTriggerData.tg_newtuple = NULL;
LocTriggerData.tg_newtuplebuf = InvalidBuffer;
}
/*
......@@ -2564,7 +2781,7 @@ AfterTriggerExecute(AfterTriggerEvent event,
*/
LocTriggerData.type = T_TriggerData;
LocTriggerData.tg_event =
event->ate_event & (TRIGGER_EVENT_OPMASK | TRIGGER_EVENT_ROW);
evtshared->ats_event & (TRIGGER_EVENT_OPMASK | TRIGGER_EVENT_ROW);
LocTriggerData.tg_relation = rel;
MemoryContextReset(per_tuple_context);
......@@ -2578,16 +2795,16 @@ AfterTriggerExecute(AfterTriggerEvent event,
finfo,
NULL,
per_tuple_context);
if (rettuple != NULL && rettuple != &oldtuple && rettuple != &newtuple)
if (rettuple != NULL && rettuple != &tuple1 && rettuple != &tuple2)
heap_freetuple(rettuple);
/*
* Release buffers
*/
if (oldbuffer != InvalidBuffer)
ReleaseBuffer(oldbuffer);
if (newbuffer != InvalidBuffer)
ReleaseBuffer(newbuffer);
if (buffer1 != InvalidBuffer)
ReleaseBuffer(buffer1);
if (buffer2 != InvalidBuffer)
ReleaseBuffer(buffer2);
/*
* If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count
......@@ -2605,7 +2822,7 @@ AfterTriggerExecute(AfterTriggerEvent event,
* that can be invoked now with the current firing ID.
*
* If move_list isn't NULL, events that are not to be invoked now are
* removed from the given list and appended to move_list.
* transferred to move_list.
*
* When immediate_only is TRUE, do not invoke currently-deferred triggers.
* (This will be FALSE only at main transaction exit.)
......@@ -2618,26 +2835,22 @@ afterTriggerMarkEvents(AfterTriggerEventList *events,
bool immediate_only)
{
bool found = false;
AfterTriggerEvent event,
prev_event;
prev_event = NULL;
event = events->head;
AfterTriggerEvent event;
AfterTriggerEventChunk *chunk;
while (event != NULL)
for_each_event_chunk(event, chunk, *events)
{
AfterTriggerShared evtshared = GetTriggerSharedData(event);
bool defer_it = false;
AfterTriggerEvent next_event;
if (!(event->ate_event &
if (!(event->ate_flags &
(AFTER_TRIGGER_DONE | AFTER_TRIGGER_IN_PROGRESS)))
{
/*
* This trigger hasn't been called or scheduled yet. Check if we
* should call it now.
*/
if (immediate_only &&
afterTriggerCheckState(event->ate_tgoid, event->ate_event))
if (immediate_only && afterTriggerCheckState(evtshared))
{
defer_it = true;
}
......@@ -2646,8 +2859,8 @@ afterTriggerMarkEvents(AfterTriggerEventList *events,
/*
* Mark it as to be fired in this firing cycle.
*/
event->ate_firing_id = afterTriggers->firing_counter;
event->ate_event |= AFTER_TRIGGER_IN_PROGRESS;
evtshared->ats_firing_id = afterTriggers->firing_counter;
event->ate_flags |= AFTER_TRIGGER_IN_PROGRESS;
found = true;
}
}
......@@ -2655,45 +2868,19 @@ afterTriggerMarkEvents(AfterTriggerEventList *events,
/*
* If it's deferred, move it to move_list, if requested.
*/
next_event = event->ate_next;
if (defer_it && move_list != NULL)
{
/* Delink it from input list */
if (prev_event)
prev_event->ate_next = next_event;
else
events->head = next_event;
/* and add it to move_list */
event->ate_next = NULL;
if (move_list->tail == NULL)
{
/* first list entry */
move_list->head = event;
move_list->tail = event;
}
else
{
move_list->tail->ate_next = event;
move_list->tail = event;
}
}
else
{
/* Keep it in input list */
prev_event = event;
/* add it to move_list */
afterTriggerAddEvent(move_list, event, evtshared);
/* mark original copy "done" so we don't do it again */
event->ate_flags |= AFTER_TRIGGER_DONE;
}
event = next_event;
}
/* Update list tail pointer in case we moved tail event */
events->tail = prev_event;
return found;
}
/* ----------
/*
* afterTriggerInvokeEvents()
*
* Scan the given event list for events that are marked as to be fired
......@@ -2704,18 +2891,24 @@ afterTriggerMarkEvents(AfterTriggerEventList *events,
* make one locally to cache the info in case there are multiple trigger
* events per rel.
*
* When delete_ok is TRUE, it's okay to delete fully-processed events.
* The events list pointers are updated.
* ----------
* When delete_ok is TRUE, it's safe to delete fully-processed events.
* (We are not very tense about that: we simply reset a chunk to be empty
* if all its events got fired. The objective here is just to avoid useless
* rescanning of events when a trigger queues new events during transaction
* end, so it's not necessary to worry much about the case where only
* some events are fired.)
*
* Returns TRUE if no unfired events remain in the list (this allows us
* to avoid repeating afterTriggerMarkEvents).
*/
static void
static bool
afterTriggerInvokeEvents(AfterTriggerEventList *events,
CommandId firing_id,
EState *estate,
bool delete_ok)
{
AfterTriggerEvent event,
prev_event;
bool all_fired = true;
AfterTriggerEventChunk *chunk;
MemoryContext per_tuple_context;
bool local_estate = false;
Relation rel = NULL;
......@@ -2738,41 +2931,43 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
prev_event = NULL;
event = events->head;
for_each_chunk(chunk, *events)
{
AfterTriggerEvent event;
bool all_fired_in_chunk = true;
while (event != NULL)
for_each_event(event, chunk)
{
AfterTriggerEvent next_event;
AfterTriggerShared evtshared = GetTriggerSharedData(event);
/*
* Is it one for me to fire?
*/
if ((event->ate_event & AFTER_TRIGGER_IN_PROGRESS) &&
event->ate_firing_id == firing_id)
if ((event->ate_flags & AFTER_TRIGGER_IN_PROGRESS) &&
evtshared->ats_firing_id == firing_id)
{
/*
* So let's fire it... but first, find the correct relation if
* this is not the same relation as before.
*/
if (rel == NULL || RelationGetRelid(rel) != event->ate_relid)
if (rel == NULL || RelationGetRelid(rel) != evtshared->ats_relid)
{
ResultRelInfo *rInfo;
rInfo = ExecGetTriggerResultRel(estate, event->ate_relid);
rInfo = ExecGetTriggerResultRel(estate, evtshared->ats_relid);
rel = rInfo->ri_RelationDesc;
trigdesc = rInfo->ri_TrigDesc;
finfo = rInfo->ri_TrigFunctions;
instr = rInfo->ri_TrigInstrument;
if (trigdesc == NULL) /* should not happen */
elog(ERROR, "relation %u has no triggers",
event->ate_relid);
evtshared->ats_relid);
}
/*
* Fire it. Note that the AFTER_TRIGGER_IN_PROGRESS flag is still
* set, so recursive examinations of the event list won't try to
* re-fire it.
* Fire it. Note that the AFTER_TRIGGER_IN_PROGRESS flag is
* still set, so recursive examinations of the event list
* won't try to re-fire it.
*/
AfterTriggerExecute(event, rel, trigdesc, finfo, instr,
per_tuple_context);
......@@ -2780,40 +2975,23 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events,
/*
* Mark the event as done.
*/
event->ate_event &= ~AFTER_TRIGGER_IN_PROGRESS;
event->ate_event |= AFTER_TRIGGER_DONE;
event->ate_flags &= ~AFTER_TRIGGER_IN_PROGRESS;
event->ate_flags |= AFTER_TRIGGER_DONE;
}
/*
* If it's now done, throw it away, if allowed.
*
* NB: it's possible the trigger call above added more events to the
* queue, or that calls we will do later will want to add more, so we
* have to be careful about maintaining list validity at all points
* here.
*/
next_event = event->ate_next;
if ((event->ate_event & AFTER_TRIGGER_DONE) && delete_ok)
else if (!(event->ate_flags & AFTER_TRIGGER_DONE))
{
/* Delink it from list and free it */
if (prev_event)
prev_event->ate_next = next_event;
else
events->head = next_event;
pfree(event);
/* something remains to be done */
all_fired = all_fired_in_chunk = false;
}
else
{
/* Keep it in list */
prev_event = event;
}
event = next_event;
/* Clear the chunk if delete_ok and nothing left of interest */
if (delete_ok && all_fired_in_chunk)
{
chunk->freeptr = CHUNK_DATA_START(chunk);
chunk->endfree = chunk->endptr;
}
}
/* Update list tail pointer in case we just deleted tail event */
events->tail = prev_event;
/* Release working resources */
MemoryContextDelete(per_tuple_context);
......@@ -2832,6 +3010,8 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events,
}
FreeExecutorState(estate);
}
return all_fired;
}
......@@ -2854,10 +3034,11 @@ AfterTriggerBeginXact(void)
MemoryContextAlloc(TopTransactionContext,
sizeof(AfterTriggersData));
afterTriggers->firing_counter = FirstCommandId;
afterTriggers->firing_counter = (CommandId) 1; /* mustn't be 0 */
afterTriggers->state = SetConstraintStateCreate(8);
afterTriggers->events.head = NULL;
afterTriggers->events.tail = NULL;
afterTriggers->events.tailfree = NULL;
afterTriggers->query_depth = -1;
/* We initialize the query stack to a reasonable size */
......@@ -2870,7 +3051,6 @@ AfterTriggerBeginXact(void)
afterTriggers->event_cxt = NULL;
/* Subtransaction stack is empty until/unless needed */
afterTriggers->cxt_stack = NULL;
afterTriggers->state_stack = NULL;
afterTriggers->events_stack = NULL;
afterTriggers->depth_stack = NULL;
......@@ -2891,6 +3071,8 @@ AfterTriggerBeginXact(void)
void
AfterTriggerBeginQuery(void)
{
AfterTriggerEventList *events;
/* Must be inside a transaction */
Assert(afterTriggers != NULL);
......@@ -2912,8 +3094,10 @@ AfterTriggerBeginQuery(void)
}
/* Initialize this query's list to empty */
afterTriggers->query_stack[afterTriggers->query_depth].head = NULL;
afterTriggers->query_stack[afterTriggers->query_depth].tail = NULL;
events = &afterTriggers->query_stack[afterTriggers->query_depth];
events->head = NULL;
events->tail = NULL;
events->tailfree = NULL;
}
......@@ -2950,20 +3134,32 @@ AfterTriggerEndQuery(EState *estate)
* IMMEDIATE: all events we have decided to defer will be available for it
* to fire.
*
* We loop in case a trigger queues more events.
* We loop in case a trigger queues more events at the same query level
* (is that even possible?). Be careful here: firing a trigger could
* result in query_stack being repalloc'd, so we can't save its address
* across afterTriggerInvokeEvents calls.
*
* If we find no firable events, we don't have to increment
* firing_counter.
*/
for (;;)
{
events = &afterTriggers->query_stack[afterTriggers->query_depth];
while (afterTriggerMarkEvents(events, &afterTriggers->events, true))
if (afterTriggerMarkEvents(events, &afterTriggers->events, true))
{
CommandId firing_id = afterTriggers->firing_counter++;
/* OK to delete the immediate events after processing them */
afterTriggerInvokeEvents(events, firing_id, estate, true);
if (afterTriggerInvokeEvents(events, firing_id, estate, true))
break; /* all fired */
}
else
break;
}
/* Release query-local storage for events */
afterTriggerFreeEventList(&afterTriggers->query_stack[afterTriggers->query_depth]);
afterTriggers->query_depth--;
}
......@@ -3011,13 +3207,17 @@ AfterTriggerFireDeferred(void)
{
CommandId firing_id = afterTriggers->firing_counter++;
afterTriggerInvokeEvents(events, firing_id, NULL, true);
if (afterTriggerInvokeEvents(events, firing_id, NULL, true))
break; /* all fired */
}
/*
* We don't bother freeing the event list, since it will go away anyway
* (and more efficiently than via pfree) in AfterTriggerEndXact.
*/
if (snap_pushed)
PopActiveSnapshot();
Assert(events->head == NULL);
}
......@@ -3045,10 +3245,6 @@ AfterTriggerEndXact(bool isCommit)
* pending-events list could be large, and so it's useful to discard it as
* soon as possible --- especially if we are aborting because we ran out
* of memory for the list!
*
* (Note: any event_cxts of child subtransactions could also be deleted
* here, but we have no convenient way to find them, so we leave it to
* TopTransactionContext reset to clean them up.)
*/
if (afterTriggers && afterTriggers->event_cxt)
MemoryContextDelete(afterTriggers->event_cxt);
......@@ -3087,8 +3283,6 @@ AfterTriggerBeginSubXact(void)
old_cxt = MemoryContextSwitchTo(TopTransactionContext);
#define DEFTRIG_INITALLOC 8
afterTriggers->cxt_stack = (MemoryContext *)
palloc(DEFTRIG_INITALLOC * sizeof(MemoryContext));
afterTriggers->state_stack = (SetConstraintState *)
palloc(DEFTRIG_INITALLOC * sizeof(SetConstraintState));
afterTriggers->events_stack = (AfterTriggerEventList *)
......@@ -3106,9 +3300,6 @@ AfterTriggerBeginSubXact(void)
/* repalloc will keep the stacks in the same context */
int new_alloc = afterTriggers->maxtransdepth * 2;
afterTriggers->cxt_stack = (MemoryContext *)
repalloc(afterTriggers->cxt_stack,
new_alloc * sizeof(MemoryContext));
afterTriggers->state_stack = (SetConstraintState *)
repalloc(afterTriggers->state_stack,
new_alloc * sizeof(SetConstraintState));
......@@ -3130,7 +3321,6 @@ AfterTriggerBeginSubXact(void)
* is not saved until/unless changed. Likewise, we don't make a
* per-subtransaction event context until needed.
*/
afterTriggers->cxt_stack[my_level] = NULL;
afterTriggers->state_stack[my_level] = NULL;
afterTriggers->events_stack[my_level] = afterTriggers->events;
afterTriggers->depth_stack[my_level] = afterTriggers->query_depth;
......@@ -3148,6 +3338,7 @@ AfterTriggerEndSubXact(bool isCommit)
int my_level = GetCurrentTransactionNestLevel();
SetConstraintState state;
AfterTriggerEvent event;
AfterTriggerEventChunk *chunk;
CommandId subxact_firing_id;
/*
......@@ -3172,55 +3363,27 @@ AfterTriggerEndSubXact(bool isCommit)
afterTriggers->state_stack[my_level] = NULL;
Assert(afterTriggers->query_depth ==
afterTriggers->depth_stack[my_level]);
/*
* It's entirely possible that the subxact created an event_cxt but
* there is not anything left in it (because all the triggers were
* fired at end-of-statement). If so, we should release the context
* to prevent memory leakage in a long sequence of subtransactions. We
* can detect whether there's anything of use in the context by seeing
* if anything was added to the global events list since subxact
* start. (This test doesn't catch every case where the context is
* deletable; for instance maybe the only additions were from a
* sub-sub-xact. But it handles the common case.)
*/
if (afterTriggers->cxt_stack[my_level] &&
afterTriggers->events.tail == afterTriggers->events_stack[my_level].tail)
{
MemoryContextDelete(afterTriggers->cxt_stack[my_level]);
/* avoid double delete if abort later */
afterTriggers->cxt_stack[my_level] = NULL;
}
}
else
{
/*
* Aborting. We don't really need to release the subxact's event_cxt,
* since it will go away anyway when CurTransactionContext gets reset,
* but doing so early in subxact abort helps free space we might need.
*
* (Note: any event_cxts of child subtransactions could also be
* deleted here, but we have no convenient way to find them, so we
* leave it to CurTransactionContext reset to clean them up.)
* Aborting. Release any event lists from queries being aborted,
* and restore query_depth to its pre-subxact value.
*/
if (afterTriggers->cxt_stack[my_level])
while (afterTriggers->query_depth > afterTriggers->depth_stack[my_level])
{
MemoryContextDelete(afterTriggers->cxt_stack[my_level]);
/* avoid double delete if repeated aborts */
afterTriggers->cxt_stack[my_level] = NULL;
afterTriggerFreeEventList(&afterTriggers->query_stack[afterTriggers->query_depth]);
afterTriggers->query_depth--;
}
Assert(afterTriggers->query_depth ==
afterTriggers->depth_stack[my_level]);
/*
* Restore the pointers from the stacks.
*/
afterTriggers->events = afterTriggers->events_stack[my_level];
afterTriggers->query_depth = afterTriggers->depth_stack[my_level];
/*
* Cleanup the tail of the list.
* Restore the global deferred-event list to its former length,
* discarding any events queued by the subxact.
*/
if (afterTriggers->events.tail != NULL)
afterTriggers->events.tail->ate_next = NULL;
afterTriggerRestoreEventList(&afterTriggers->events,
&afterTriggers->events_stack[my_level]);
/*
* Restore the trigger state. If the saved state is NULL, then this
......@@ -3244,15 +3407,15 @@ AfterTriggerEndSubXact(bool isCommit)
* subxacts started after it.)
*/
subxact_firing_id = afterTriggers->firing_stack[my_level];
for (event = afterTriggers->events.head;
event != NULL;
event = event->ate_next)
for_each_event_chunk(event, chunk, afterTriggers->events)
{
if (event->ate_event &
AfterTriggerShared evtshared = GetTriggerSharedData(event);
if (event->ate_flags &
(AFTER_TRIGGER_DONE | AFTER_TRIGGER_IN_PROGRESS))
{
if (event->ate_firing_id >= subxact_firing_id)
event->ate_event &=
if (evtshared->ats_firing_id >= subxact_firing_id)
event->ate_flags &=
~(AFTER_TRIGGER_DONE | AFTER_TRIGGER_IN_PROGRESS);
}
}
......@@ -3579,8 +3742,9 @@ AfterTriggerSetState(ConstraintsSetStmt *stmt)
* but we'd better not if inside a subtransaction, since the
* subtransaction could later get rolled back.
*/
afterTriggerInvokeEvents(events, firing_id, NULL,
!IsSubTransaction());
if (afterTriggerInvokeEvents(events, firing_id, NULL,
!IsSubTransaction()))
break; /* all fired */
}
}
}
......@@ -3604,6 +3768,7 @@ bool
AfterTriggerPendingOnRel(Oid relid)
{
AfterTriggerEvent event;
AfterTriggerEventChunk *chunk;
int depth;
/* No-op if we aren't in a transaction. (Shouldn't happen?) */
......@@ -3611,19 +3776,19 @@ AfterTriggerPendingOnRel(Oid relid)
return false;
/* Scan queued events */
for (event = afterTriggers->events.head;
event != NULL;
event = event->ate_next)
for_each_event_chunk(event, chunk, afterTriggers->events)
{
AfterTriggerShared evtshared = GetTriggerSharedData(event);
/*
* We can ignore completed events. (Even if a DONE flag is rolled
* back by subxact abort, it's OK because the effects of the TRUNCATE
* or whatever must get rolled back too.)
*/
if (event->ate_event & AFTER_TRIGGER_DONE)
if (event->ate_flags & AFTER_TRIGGER_DONE)
continue;
if (event->ate_relid == relid)
if (evtshared->ats_relid == relid)
return true;
}
......@@ -3634,14 +3799,14 @@ AfterTriggerPendingOnRel(Oid relid)
*/
for (depth = 0; depth <= afterTriggers->query_depth; depth++)
{
for (event = afterTriggers->query_stack[depth].head;
event != NULL;
event = event->ate_next)
for_each_event_chunk(event, chunk, afterTriggers->query_stack[depth])
{
if (event->ate_event & AFTER_TRIGGER_DONE)
AfterTriggerShared evtshared = GetTriggerSharedData(event);
if (event->ate_flags & AFTER_TRIGGER_DONE)
continue;
if (event->ate_relid == relid)
if (evtshared->ats_relid == relid)
return true;
}
}
......@@ -3665,35 +3830,85 @@ AfterTriggerSaveEvent(ResultRelInfo *relinfo, int event, bool row_trigger,
{
Relation rel = relinfo->ri_RelationDesc;
TriggerDesc *trigdesc = relinfo->ri_TrigDesc;
int my_level = GetCurrentTransactionNestLevel();
MemoryContext *cxtptr;
AfterTriggerEvent new_event;
AfterTriggerEventData new_event;
AfterTriggerSharedData new_shared;
int i;
int ntriggers;
int *tgindx;
ItemPointerData oldctid;
ItemPointerData newctid;
if (afterTriggers == NULL)
elog(ERROR, "AfterTriggerSaveEvent() called outside of transaction");
Assert(afterTriggers->query_depth >= 0);
/*
* event is used both as a bitmask and an array offset,
* so make sure we don't walk off the edge of our arrays
*/
Assert(event >= 0 && event < TRIGGER_NUM_EVENT_CLASSES);
/*
* Get the CTID's of OLD and NEW
* Validate the event code and collect the associated tuple CTIDs.
*
* The event code will be used both as a bitmask and an array offset, so
* validation is important to make sure we don't walk off the edge of our
* arrays.
*/
if (oldtup != NULL)
ItemPointerCopy(&(oldtup->t_self), &(oldctid));
new_event.ate_flags = 0;
switch (event)
{
case TRIGGER_EVENT_INSERT:
if (row_trigger)
{
Assert(oldtup == NULL);
Assert(newtup != NULL);
ItemPointerCopy(&(newtup->t_self), &(new_event.ate_ctid1));
ItemPointerSetInvalid(&(new_event.ate_ctid2));
}
else
ItemPointerSetInvalid(&(oldctid));
if (newtup != NULL)
ItemPointerCopy(&(newtup->t_self), &(newctid));
{
Assert(oldtup == NULL);
Assert(newtup == NULL);
ItemPointerSetInvalid(&(new_event.ate_ctid1));
ItemPointerSetInvalid(&(new_event.ate_ctid2));
}
break;
case TRIGGER_EVENT_DELETE:
if (row_trigger)
{
Assert(oldtup != NULL);
Assert(newtup == NULL);
ItemPointerCopy(&(oldtup->t_self), &(new_event.ate_ctid1));
ItemPointerSetInvalid(&(new_event.ate_ctid2));
}
else
{
Assert(oldtup == NULL);
Assert(newtup == NULL);
ItemPointerSetInvalid(&(new_event.ate_ctid1));
ItemPointerSetInvalid(&(new_event.ate_ctid2));
}
break;
case TRIGGER_EVENT_UPDATE:
if (row_trigger)
{
Assert(oldtup != NULL);
Assert(newtup != NULL);
ItemPointerCopy(&(oldtup->t_self), &(new_event.ate_ctid1));
ItemPointerCopy(&(newtup->t_self), &(new_event.ate_ctid2));
new_event.ate_flags |= AFTER_TRIGGER_2CTIDS;
}
else
ItemPointerSetInvalid(&(newctid));
{
Assert(oldtup == NULL);
Assert(newtup == NULL);
ItemPointerSetInvalid(&(new_event.ate_ctid1));
ItemPointerSetInvalid(&(new_event.ate_ctid2));
}
break;
case TRIGGER_EVENT_TRUNCATE:
Assert(oldtup == NULL);
Assert(newtup == NULL);
ItemPointerSetInvalid(&(new_event.ate_ctid1));
ItemPointerSetInvalid(&(new_event.ate_ctid2));
break;
default:
elog(ERROR, "invalid after-trigger event code: %d", event);
break;
}
/*
* Scan the appropriate set of triggers
......@@ -3772,44 +3987,18 @@ AfterTriggerSaveEvent(ResultRelInfo *relinfo, int event, bool row_trigger,
}
/*
* If we don't yet have an event context for the current (sub)xact,
* create one. Make it a child of CurTransactionContext to ensure it
* will go away if the subtransaction aborts.
* Fill in event structure and add it to the current query's queue.
*/
if (my_level > 1) /* subtransaction? */
{
Assert(my_level < afterTriggers->maxtransdepth);
cxtptr = &afterTriggers->cxt_stack[my_level];
}
else
cxtptr = &afterTriggers->event_cxt;
if (*cxtptr == NULL)
*cxtptr = AllocSetContextCreate(CurTransactionContext,
"AfterTriggerEvents",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
/*
* Create a new event.
*/
new_event = (AfterTriggerEvent)
MemoryContextAlloc(*cxtptr, sizeof(AfterTriggerEventData));
new_event->ate_next = NULL;
new_event->ate_event =
new_shared.ats_event =
(event & TRIGGER_EVENT_OPMASK) |
(row_trigger ? TRIGGER_EVENT_ROW : 0) |
(trigger->tgdeferrable ? AFTER_TRIGGER_DEFERRABLE : 0) |
(trigger->tginitdeferred ? AFTER_TRIGGER_INITDEFERRED : 0);
new_event->ate_firing_id = 0;
new_event->ate_tgoid = trigger->tgoid;
new_event->ate_relid = rel->rd_id;
ItemPointerCopy(&oldctid, &(new_event->ate_oldctid));
ItemPointerCopy(&newctid, &(new_event->ate_newctid));
new_shared.ats_tgoid = trigger->tgoid;
new_shared.ats_relid = RelationGetRelid(rel);
new_shared.ats_firing_id = 0;
/*
* Add the new event to the queue.
*/
afterTriggerAddEvent(new_event);
afterTriggerAddEvent(&afterTriggers->query_stack[afterTriggers->query_depth],
&new_event, &new_shared);
}
}
......@@ -6,7 +6,7 @@
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/commands/trigger.h,v 1.68 2008/09/19 14:43:46 mha Exp $
* $PostgreSQL: pgsql/src/include/commands/trigger.h,v 1.69 2008/10/24 23:42:35 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -56,10 +56,8 @@ typedef struct TriggerData
/* More TriggerEvent flags, used only within trigger.c */
#define AFTER_TRIGGER_DONE 0x00000010
#define AFTER_TRIGGER_IN_PROGRESS 0x00000020
#define AFTER_TRIGGER_DEFERRABLE 0x00000040
#define AFTER_TRIGGER_INITDEFERRED 0x00000080
#define AFTER_TRIGGER_DEFERRABLE 0x00000010
#define AFTER_TRIGGER_INITDEFERRED 0x00000020
#define TRIGGER_FIRED_BY_INSERT(event) \
(((TriggerEvent) (event) & TRIGGER_EVENT_OPMASK) == \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment