004_verify_heapam.pl 18.1 KB
Newer Older
1 2 3

# Copyright (c) 2021, PostgreSQL Global Development Group

4 5 6 7 8 9
use strict;
use warnings;

use PostgresNode;
use TestLib;

10
use Fcntl qw(:seek);
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
use Test::More;

# This regression test demonstrates that the pg_amcheck binary correctly
# identifies specific kinds of corruption within pages.  To test this, we need
# a mechanism to create corrupt pages with predictable, repeatable corruption.
# The postgres backend cannot be expected to help us with this, as its design
# is not consistent with the goal of intentionally corrupting pages.
#
# Instead, we create a table to corrupt, and with careful consideration of how
# postgresql lays out heap pages, we seek to offsets within the page and
# overwrite deliberately chosen bytes with specific values calculated to
# corrupt the page in expected ways.  We then verify that pg_amcheck reports
# the corruption, and that it runs without crashing.  Note that the backend
# cannot simply be started to run queries against the corrupt table, as the
# backend will crash, at least for some of the corruption types we generate.
#
# Autovacuum potentially touching the table in the background makes the exact
# behavior of this test harder to reason about.  We turn it off to keep things
# simpler.  We use a "belt and suspenders" approach, turning it off for the
# system generally in postgresql.conf, and turning it off specifically for the
# test table.
#
# This test depends on the table being written to the heap file exactly as we
# expect it to be, so we take care to arrange the columns of the table, and
# insert rows of the table, that give predictable sizes and locations within
# the table page.
#
# The HeapTupleHeaderData has 23 bytes of fixed size fields before the variable
# length t_bits[] array.  We have exactly 3 columns in the table, so natts = 3,
# t_bits is 1 byte long, and t_hoff = MAXALIGN(23 + 1) = 24.
#
# We're not too fussy about which datatypes we use for the test, but we do care
# about some specific properties.  We'd like to test both fixed size and
# varlena types.  We'd like some varlena data inline and some toasted.  And
# we'd like the layout of the table such that the datums land at predictable
# offsets within the tuple.  We choose a structure without padding on all
# supported architectures:
#
# 	a BIGINT
#	b TEXT
#	c TEXT
#
# We always insert a 7-character ASCII string into field 'b', which with a
# 1-byte varlena header gives an 8 byte inline value.  We always insert a long
# text string in field 'c', long enough to force toast storage.
#
# We choose to read and write binary copies of our table's tuples, using perl's
# pack() and unpack() functions.  Perl uses a packing code system in which:
#
#	l = "Signed 32-bit Long",
#	L = "Unsigned 32-bit Long",
#	S = "Unsigned 16-bit Short",
#	C = "Unsigned 8-bit Octet",
#
# Each tuple in our table has a layout as follows:
#
#    xx xx xx xx            t_xmin: xxxx		offset = 0		L
#    xx xx xx xx            t_xmax: xxxx		offset = 4		L
#    xx xx xx xx          t_field3: xxxx		offset = 8		L
#    xx xx                   bi_hi: xx			offset = 12		S
#    xx xx                   bi_lo: xx			offset = 14		S
#    xx xx                ip_posid: xx			offset = 16		S
#    xx xx             t_infomask2: xx			offset = 18		S
#    xx xx              t_infomask: xx			offset = 20		S
#    xx                     t_hoff: x			offset = 22		C
#    xx                     t_bits: x			offset = 23		C
#    xx xx xx xx xx xx xx xx   'a': xxxxxxxx	offset = 24		LL
#    xx xx xx xx xx xx xx xx   'b': xxxxxxxx	offset = 32		CCCCCCCC
#    xx xx xx xx xx xx xx xx   'c': xxxxxxxx	offset = 40		CCllLL
#    xx xx xx xx xx xx xx xx      : xxxxxxxx	 ...continued
#    xx xx                        : xx      	 ...continued
#
# We could choose to read and write columns 'b' and 'c' in other ways, but
# it is convenient enough to do it this way.  We define packing code
# constants here, where they can be compared easily against the layout.

# pack()/unpack() template for one tuple of public.test, in on-disk field
# order: the fixed heap tuple header fields plus t_hoff and t_bits
# (LLLSSSSSCC), column 'a' as two 32-bit halves (LL), column 'b' as a 1-byte
# varlena header followed by seven characters (CCCCCCCC), and column 'c' as a
# toast pointer (CCllLL).
use constant HEAPTUPLE_PACK_CODE => 'LLLSSSSSCCLLCCCCCCCCCCllLL';

# Number of bytes described by HEAPTUPLE_PACK_CODE; this is how much
# read_tuple() reads and write_tuple() writes.
use constant HEAPTUPLE_PACK_LENGTH => 58;    # Total size

# Read a tuple of our table from a heap page.
#
# Takes an open filehandle to the heap file, and the offset of the tuple.
#
# Rather than returning the binary data from the file, unpacks the data into a
# perl hash with named fields.  These fields exactly match the ones understood
# by write_tuple(), below.  An additional key 'b' holds the text of column 'b'
# reassembled from its seven body bytes.  Returns a reference to this hash.
#
# Calls BAIL_OUT if positioning or reading the file fails.
#
sub read_tuple
{
	my ($fh, $offset) = @_;
	my $buffer;

	# sysread() bypasses perl's buffered I/O, so the position must be set
	# with sysseek(); seek() is not reliable in combination with sysread().
	# sysseek() returns "0 but true" for offset zero, so "or" is safe here.
	sysseek($fh, $offset, SEEK_SET)
	  or BAIL_OUT("sysseek failed: $!");
	defined(sysread($fh, $buffer, HEAPTUPLE_PACK_LENGTH))
	  or BAIL_OUT("sysread failed: $!");

	# Field names, in the same order as the codes in HEAPTUPLE_PACK_CODE.
	my @fields = qw(
	  t_xmin t_xmax t_field3
	  bi_hi bi_lo ip_posid
	  t_infomask2 t_infomask
	  t_hoff t_bits
	  a_1 a_2
	  b_header
	  b_body1 b_body2 b_body3 b_body4 b_body5 b_body6 b_body7
	  c_va_header c_va_vartag
	  c_va_rawsize c_va_extinfo
	  c_va_valueid c_va_toastrelid
	);

	# Assign through a hash slice rather than overwriting @_.  Fields for
	# which unpack() produced no value are still created, holding undef.
	my %tup;
	@tup{@fields} = unpack(HEAPTUPLE_PACK_CODE, $buffer);

	# Stitch together the text for column 'b'
	$tup{b} = join('', map { chr($tup{"b_body$_"}) } (1 .. 7));
	return \%tup;
}

# Write a tuple of our table to a heap page.
#
# Takes an open filehandle to the heap file, the offset of the tuple, and a
# reference to a hash with the tuple values, as returned by read_tuple().
# Writes the tuple fields from the hash into the heap file.  The stitched 'b'
# key added by read_tuple() is ignored; only the individual b_body fields are
# written.
#
# The purpose of this function is to write a tuple back to disk with some
# subset of fields modified.  The function does no validation of the field
# values.  Use cautiously.
#
# Calls BAIL_OUT if positioning fails, if the write fails, or if fewer than
# HEAPTUPLE_PACK_LENGTH bytes were written.
#
sub write_tuple
{
	my ($fh, $offset, $tup) = @_;

	# Field names, in the same order as the codes in HEAPTUPLE_PACK_CODE.
	my @fields = qw(
	  t_xmin t_xmax t_field3
	  bi_hi bi_lo ip_posid
	  t_infomask2 t_infomask
	  t_hoff t_bits
	  a_1 a_2
	  b_header
	  b_body1 b_body2 b_body3 b_body4 b_body5 b_body6 b_body7
	  c_va_header c_va_vartag
	  c_va_rawsize c_va_extinfo
	  c_va_valueid c_va_toastrelid
	);
	my $buffer = pack(HEAPTUPLE_PACK_CODE, @{$tup}{@fields});

	# syswrite() bypasses perl's buffered I/O, so the position must be set
	# with sysseek(); seek() is not reliable in combination with syswrite().
	sysseek($fh, $offset, SEEK_SET)
	  or BAIL_OUT("sysseek failed: $!");

	my $written = syswrite($fh, $buffer, HEAPTUPLE_PACK_LENGTH);
	defined($written)
	  or BAIL_OUT("syswrite failed: $!");

	# A partially written tuple would leave the page in a state the rest of
	# the test does not expect, so treat it as fatal too.
	$written == HEAPTUPLE_PACK_LENGTH
	  or BAIL_OUT("syswrite wrote only $written of "
		  . HEAPTUPLE_PACK_LENGTH
		  . " bytes");
	return;
}

# Set umask so test directories and files are created with default permissions
umask(0077);

# Set up the node.  Once we create and corrupt the table,
# autovacuum workers visiting the table could crash the backend.
# Disable autovacuum so that won't happen.
my $node = get_new_node('test');
$node->init;
$node->append_conf('postgresql.conf', 'autovacuum=off');

# Start the node and load the extensions.  We depend on both
# amcheck and pageinspect for this test.
$node->start;
my $port   = $node->port;
my $pgdata = $node->data_dir;
$node->safe_psql('postgres', "CREATE EXTENSION amcheck");
$node->safe_psql('postgres', "CREATE EXTENSION pageinspect");

# Get a non-zero datfrozenxid
$node->safe_psql('postgres', qq(VACUUM FREEZE));

# Create the test table with precisely the schema that our corruption function
# expects.  Autovacuum is also disabled on the table itself, and column 'c' is
# set to EXTERNAL storage.
$node->safe_psql(
	'postgres', qq(
		CREATE TABLE public.test (a BIGINT, b TEXT, c TEXT);
		ALTER TABLE public.test SET (autovacuum_enabled=false);
		ALTER TABLE public.test ALTER COLUMN c SET STORAGE EXTERNAL;
		CREATE INDEX test_idx ON public.test(a, b);
	));

# We want (0 < datfrozenxid < test.relfrozenxid).  To achieve this, we freeze
# an otherwise unused table, public.junk, prior to inserting data and freezing
# public.test
$node->safe_psql(
	'postgres', qq(
		CREATE TABLE public.junk AS SELECT 'junk'::TEXT AS junk_column;
		ALTER TABLE public.junk SET (autovacuum_enabled=false);
		VACUUM FREEZE public.junk
	));

# Locate the heap file of public.test on disk; the path reported by the server
# is appended to the data directory.
my $rel = $node->safe_psql('postgres',
	qq(SELECT pg_relation_filepath('public.test')));
my $relpath = "$pgdata/$rel";

# Insert data and freeze public.test.  Each of the ROWCOUNT rows is inserted
# and frozen by its own safe_psql call, and every row has the same contents.
use constant ROWCOUNT => 16;
$node->safe_psql(
	'postgres', qq(
	INSERT INTO public.test (a, b, c)
		VALUES (
			x'DEADF9F9DEADF9F9'::bigint,
			'abcdefg',
			repeat('w', 10000)
		);
	VACUUM FREEZE public.test
	)) for (1 .. ROWCOUNT);

# Fetch the freeze thresholds that the corruption tests below are built
# around.
my $relfrozenxid = $node->safe_psql('postgres',
	q(select relfrozenxid from pg_class where relname = 'test'));
my $datfrozenxid = $node->safe_psql('postgres',
	q(select datfrozenxid from pg_database where datname = 'postgres'));

# Sanity check that our 'test' table has a relfrozenxid newer than the
# datfrozenxid for the database, and that the datfrozenxid is greater than the
# first normal xid.  We rely on these invariants in some of our tests.  If
# they do not hold, skip the whole test rather than fail.
if ($datfrozenxid <= 3 || $datfrozenxid >= $relfrozenxid)
{
	$node->clean_node;
	plan skip_all =>
	  "Xid thresholds not as expected: got datfrozenxid = $datfrozenxid, relfrozenxid = $relfrozenxid";
	exit;
}

# Find where each of the tuples is located on the page.  A single query
# returns the lp_off of the first ROWCOUNT line pointers of block 0, one per
# output line, so we split the result on newlines instead of launching psql
# once per tuple.
my @lp_off = split /\n/, $node->safe_psql(
	'postgres', qq(
select lp_off from heap_page_items(get_raw_page('test', 'main', 0))
	limit @{[ ROWCOUNT ]}));

# Sanity check that our 'test' table on disk layout matches expectations.  If
# this is not so, we will have to skip the test until somebody updates the test
# to work on this platform.
#
# The node is stopped while we read the heap file directly, and restarted
# afterwards.
$node->stop;
my $file;
open($file, '+<', $relpath)
  or BAIL_OUT("open failed: $!");
binmode $file;

my $ENDIANNESS;
for my $tupidx (0 .. ROWCOUNT - 1)
{
	my $offset = $lp_off[$tupidx];
	my $tup = read_tuple($file, $offset);

	# Sanity-check that the data appears on the page where we expect.
	my $a_1 = $tup->{a_1};
	my $a_2 = $tup->{a_2};
	my $b   = $tup->{b};
	if ($a_1 != 0xDEADF9F9 || $a_2 != 0xDEADF9F9 || $b ne 'abcdefg')
	{
		close($file);    # ignore errors on close; we're exiting anyway
		$node->clean_node;
		plan skip_all =>
		  sprintf(
			"Page layout differs from our expectations: expected (%x, %x, \"%s\"), got (%x, %x, \"%s\")",
			0xDEADF9F9, 0xDEADF9F9, "abcdefg", $a_1, $a_2, $b);
		exit;
	}

	# Determine endianness of current platform from the 1-byte varlena header
	$ENDIANNESS = $tup->{b_header} == 0x11 ? "little" : "big";
}
close($file)
  or BAIL_OUT("close failed: $!");
$node->start;

# Ok, Xids and page layout look ok.  We can run corruption tests.
plan tests => 19;

# Check that pg_amcheck runs against the uncorrupted table without error.
$node->command_ok(
	[ 'pg_amcheck', '-p', $port, 'postgres' ],
	'pg_amcheck test table, prior to corruption');

# Check that pg_amcheck runs against the uncorrupted table and index without error.
# NOTE(review): this invocation is identical to the previous one; only the
# test description differs.  Confirm whether different options were intended.
$node->command_ok([ 'pg_amcheck', '-p', $port, 'postgres' ],
	'pg_amcheck test table and index, prior to corruption');

# Stop the node so that the heap file can be modified directly.
$node->stop;

# Some #define constants from access/htup_details.h for use while corrupting.
# Flag bits and masks applied to the tuple header's t_infomask and
# t_infomask2 fields when corrupting tuples, declared together in a single
# constant list.
use constant {
	HEAP_HASNULL        => 0x0001,
	HEAP_XMAX_LOCK_ONLY => 0x0080,
	HEAP_XMIN_COMMITTED => 0x0100,
	HEAP_XMIN_INVALID   => 0x0200,
	HEAP_XMAX_COMMITTED => 0x0400,
	HEAP_XMAX_INVALID   => 0x0800,
	HEAP_NATTS_MASK     => 0x07FF,
	HEAP_XMAX_IS_MULTI  => 0x1000,
	HEAP_KEYS_UPDATED   => 0x2000,
};
# Helper function to generate a regular expression matching the header we
# expect verify_heapam() to return given which fields we expect to be non-null.
#
# Takes up to three arguments: block number, offset number and attribute
# number.  The most specific defined argument decides the form of the header:
# attribute-level, tuple-level, block-level, or, with no defined argument,
# relation-level.  Returns a compiled regular expression that also consumes
# the whitespace following the header's colon.
sub header
{
	my ($blkno, $offnum, $attnum) = @_;
	return
	  qr/heap table "postgres\.public\.test", block $blkno, offset $offnum, attribute $attnum:\s+/ms
	  if (defined $attnum);
	return
	  qr/heap table "postgres\.public\.test", block $blkno, offset $offnum:\s+/ms
	  if (defined $offnum);
	return qr/heap table "postgres\.public\.test", block $blkno:\s+/ms
	  if (defined $blkno);
	return qr/heap table "postgres\.public\.test":\s+/ms;
}

# Corrupt the tuples, one type of corruption per tuple.  Some types of
# corruption cause verify_heapam to skip to the next tuple without
# performing any remaining checks, so we can't exercise the system properly if
# we focus all our corruption on a single tuple.
#
# Each branch below modifies the unpacked tuple and pushes onto @expected the
# regular expression(s) that the pg_amcheck output is later checked against.
# Every tuple is written back, whether or not a branch modified it.
#
my @expected;
open($file, '+<', $relpath)
  or BAIL_OUT("open failed: $!");
binmode $file;

for my $tupidx (0 .. ROWCOUNT - 1)
{
	my $offnum = $tupidx + 1;        # offnum is 1-based, not zero-based
	my $offset = $lp_off[$tupidx];
	my $tup = read_tuple($file, $offset);

	# Tuple-level header by default; the attribute-level cases below replace
	# it.
	my $header = header(0, $offnum, undef);
	if ($offnum == 1)
	{
		# Corruptly set xmin < relfrozenxid
		my $xmin = $relfrozenxid - 1;
		$tup->{t_xmin} = $xmin;
		$tup->{t_infomask} &= ~HEAP_XMIN_COMMITTED;
		$tup->{t_infomask} &= ~HEAP_XMIN_INVALID;

		# Expected corruption report
		push @expected,
		  qr/${header}xmin $xmin precedes relation freeze threshold 0:\d+/;
	}
	elsif ($offnum == 2)
	{
		# Corruptly set xmin < datfrozenxid
		my $xmin = 3;
		$tup->{t_xmin} = $xmin;
		$tup->{t_infomask} &= ~HEAP_XMIN_COMMITTED;
		$tup->{t_infomask} &= ~HEAP_XMIN_INVALID;

		push @expected,
		  qr/${header}xmin $xmin precedes oldest valid transaction ID 0:\d+/;
	}
	elsif ($offnum == 3)
	{
		# Corruptly set xmin < datfrozenxid, further back, noting circularity
		# of xid comparison.  For a new cluster with epoch = 0, the corrupt
		# xmin will be interpreted as in the future
		$tup->{t_xmin} = 4026531839;
		$tup->{t_infomask} &= ~HEAP_XMIN_COMMITTED;
		$tup->{t_infomask} &= ~HEAP_XMIN_INVALID;

		push @expected,
		  qr/${header}xmin 4026531839 equals or exceeds next valid transaction ID 0:\d+/;
	}
	elsif ($offnum == 4)
	{
		# Corruptly set xmax to the same far-away value used for xmin above;
		# it is expected to be reported as at or beyond the next valid
		# transaction ID.
		$tup->{t_xmax} = 4026531839;
		$tup->{t_infomask} &= ~HEAP_XMAX_INVALID;

		push @expected,
		  qr/${header}xmax 4026531839 equals or exceeds next valid transaction ID 0:\d+/;
	}
	elsif ($offnum == 5)
	{
		# Corrupt the tuple t_hoff, but keep it aligned properly
		$tup->{t_hoff} += 128;

		push @expected,
		  qr/${header}data begins at offset 152 beyond the tuple length 58/,
		  qr/${header}tuple data should begin at byte 24, but actually begins at byte 152 \(3 attributes, no nulls\)/;
	}
	elsif ($offnum == 6)
	{
		# Corrupt the tuple t_hoff, wrong alignment
		$tup->{t_hoff} += 3;

		push @expected,
		  qr/${header}tuple data should begin at byte 24, but actually begins at byte 27 \(3 attributes, no nulls\)/;
	}
	elsif ($offnum == 7)
	{
		# Corrupt the tuple t_hoff, underflow but correct alignment
		$tup->{t_hoff} -= 8;

		push @expected,
		  qr/${header}tuple data should begin at byte 24, but actually begins at byte 16 \(3 attributes, no nulls\)/;
	}
	elsif ($offnum == 8)
	{
		# Corrupt the tuple t_hoff, underflow and wrong alignment
		$tup->{t_hoff} -= 3;

		push @expected,
		  qr/${header}tuple data should begin at byte 24, but actually begins at byte 21 \(3 attributes, no nulls\)/;
	}
	elsif ($offnum == 9)
	{
		# Corrupt the tuple to look like it has lots of attributes, not just 3
		$tup->{t_infomask2} |= HEAP_NATTS_MASK;

		push @expected,
		  qr/${header}number of attributes 2047 exceeds maximum expected for table 3/;
	}
	elsif ($offnum == 10)
	{
		# Corrupt the tuple to look like it has lots of attributes, some of
		# them null.  This falsely creates the impression that the t_bits
		# array is longer than just one byte, but t_hoff still says otherwise.
		$tup->{t_infomask}  |= HEAP_HASNULL;
		$tup->{t_infomask2} |= HEAP_NATTS_MASK;
		$tup->{t_bits} = 0xAA;

		push @expected,
		  qr/${header}tuple data should begin at byte 280, but actually begins at byte 24 \(2047 attributes, has nulls\)/;
	}
	elsif ($offnum == 11)
	{
		# Same as above, but this time t_hoff plays along
		$tup->{t_infomask}  |= HEAP_HASNULL;
		$tup->{t_infomask2} |= (HEAP_NATTS_MASK & 0x40);
		$tup->{t_bits} = 0xAA;
		$tup->{t_hoff} = 32;

		push @expected,
		  qr/${header}number of attributes 67 exceeds maximum expected for table 3/;
	}
	elsif ($offnum == 12)
	{
		# Overwrite column 'b' 1-byte varlena header and initial characters to
		# look like a long 4-byte varlena
		#
		# On little endian machines, bytes ending in two zero bits (xxxxxx00 bytes)
		# are 4-byte length word, aligned, uncompressed data (up to 1G).  We set the
		# high six bits to 111111 and the lower two bits to 00, then the next three
		# bytes with 0xFF using 0xFCFFFFFF.
		#
		# On big endian machines, bytes starting in two zero bits (00xxxxxx bytes)
		# are 4-byte length word, aligned, uncompressed data (up to 1G).  We set the
		# low six bits to 111111 and the high two bits to 00, then the next three
		# bytes with 0xFF using 0x3FFFFFFF.
		#
		$tup->{b_header} = $ENDIANNESS eq 'little' ? 0xFC : 0x3F;
		$tup->{b_body1}  = 0xFF;
		$tup->{b_body2}  = 0xFF;
		$tup->{b_body3}  = 0xFF;

		$header = header(0, $offnum, 1);
		push @expected,
		  qr/${header}attribute with length \d+ ends at offset \d+ beyond total tuple length \d+/;
	}
	elsif ($offnum == 13)
	{
		# Corrupt the bits in column 'c' toast pointer
		$tup->{c_va_valueid} = 0xFFFFFFFF;

		$header = header(0, $offnum, 2);
		push @expected, qr/${header}toast value \d+ not found in toast table/;
	}
	elsif ($offnum == 14)
	{
		# Set both HEAP_XMAX_COMMITTED and HEAP_XMAX_IS_MULTI
		$tup->{t_infomask} |= HEAP_XMAX_COMMITTED;
		$tup->{t_infomask} |= HEAP_XMAX_IS_MULTI;
		$tup->{t_xmax} = 4;

		push @expected,
		  qr/${header}multitransaction ID 4 equals or exceeds next valid multitransaction ID 1/;
	}
	elsif ($offnum == 15)
	{
		# Set both HEAP_XMAX_COMMITTED and HEAP_XMAX_IS_MULTI
		#
		# NOTE(review): this is the last corruption case, but ROWCOUNT is 16,
		# so the tuple at offnum 16 matches no branch and is written back
		# unmodified.  Confirm whether that is intended.
		$tup->{t_infomask} |= HEAP_XMAX_COMMITTED;
		$tup->{t_infomask} |= HEAP_XMAX_IS_MULTI;
		$tup->{t_xmax} = 4000000000;

		push @expected,
		  qr/${header}multitransaction ID 4000000000 precedes relation minimum multitransaction ID threshold 1/;
	}
	write_tuple($file, $offset, $tup);
}
close($file)
  or BAIL_OUT("close failed: $!");
$node->start;

# Run pg_amcheck against the corrupt table with epoch=0, comparing actual
# corruption messages against the expected messages.  The expected exit
# status is 2, every pattern in @expected is passed as an expected stdout
# pattern, and no stderr patterns are given.
$node->command_checks_all(
	[ 'pg_amcheck', '--no-dependent-indexes', '-p', $port, 'postgres' ],
	2, [@expected], [], 'Expected corruption message output');

# Shut the node down and remove it.
$node->teardown_node;
$node->clean_node;