Commit 7087166a authored by Robert Haas

pg_upgrade: Convert old visibility map format to new format.

Commit a892234f added a second bit per
page to the visibility map, but pg_upgrade has been unaware of it up
until now.  Therefore, a pg_upgrade from an earlier major release of
PostgreSQL to any commit preceding this one and following the one
mentioned above would result in invalid visibility map contents on the
new cluster, very possibly leading to data corruption.  This plugs
that hole.

Masahiko Sawada, reviewed by Jeff Janes, Bruce Momjian, Simon Riggs,
Michael Paquier, Andres Freund, me, and others.
parent 9118d03a
...@@ -9,10 +9,16 @@ ...@@ -9,10 +9,16 @@
#include "postgres_fe.h" #include "postgres_fe.h"
#include "access/visibilitymap.h"
#include "pg_upgrade.h" #include "pg_upgrade.h"
#include "storage/bufpage.h"
#include "storage/checksum.h"
#include "storage/checksum_impl.h"
#include <sys/stat.h>
#include <fcntl.h> #include <fcntl.h>
#define BITS_PER_HEAPBLOCK_OLD 1
#ifndef WIN32 #ifndef WIN32
...@@ -138,6 +144,154 @@ copy_file(const char *srcfile, const char *dstfile, bool force) ...@@ -138,6 +144,154 @@ copy_file(const char *srcfile, const char *dstfile, bool force)
#endif #endif
/*
 * rewriteVisibilityMap()
 *
 * In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's
 * visibility map included one bit per heap page; it now includes two.
 * When upgrading a cluster from before that time to a current PostgreSQL
 * version, we could refuse to copy visibility maps from the old cluster
 * to the new cluster; the next VACUUM would recreate them, but at the
 * price of scanning the entire table.  So, instead, we rewrite the old
 * visibility maps in the new format.  That way, the all-visible bit
 * remains set for the pages for which it was set previously.  The
 * all-frozen bit is never set by this conversion; we leave that to
 * VACUUM.
 *
 * fromfile is the old-format map to read and tofile is the new-format map
 * to create.  If force is false the destination is opened with O_EXCL, so
 * an already-existing tofile is reported as an error.
 *
 * Returns NULL on success, or a message describing the failure.
 */
const char *
rewriteVisibilityMap(const char *fromfile, const char *tofile, bool force)
{
    int         src_fd;
    int         dst_fd;
    char        buffer[BLCKSZ];
    ssize_t     bytesRead;
    ssize_t     totalBytesRead = 0;
    ssize_t     src_filesize;
    int         rewriteVmBytesPerPage;
    int         save_errno;
    BlockNumber new_blkno = 0;
    struct stat statbuf;

    /* Number of old-format map bytes that fill exactly one new-format page */
    rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2;

    if ((fromfile == NULL) || (tofile == NULL))
        return "Invalid old file or new file";

    if ((src_fd = open(fromfile, O_RDONLY, 0)) < 0)
        return getErrorText();

    if (fstat(src_fd, &statbuf) != 0)
    {
        /* close() may overwrite errno, so preserve the fstat() failure */
        save_errno = errno;
        close(src_fd);
        errno = save_errno;
        return getErrorText();
    }

    if ((dst_fd = open(tofile, O_RDWR | O_CREAT | (force ? 0 : O_EXCL),
                       S_IRUSR | S_IWUSR)) < 0)
    {
        save_errno = errno;
        close(src_fd);
        errno = save_errno;
        return getErrorText();
    }

    /* Save old file size */
    src_filesize = statbuf.st_size;

    /*
     * Turn each visibility map page into 2 pages one by one.  Each new page
     * has the same page header as the old one.  If the last section of the
     * last page is empty, we skip it, mostly to avoid turning one-page
     * visibility maps for small relations into two pages needlessly.
     */
    while ((bytesRead = read(src_fd, buffer, BLCKSZ)) == BLCKSZ)
    {
        char       *old_cur;
        char       *old_blkend;
        PageHeaderData pageheader;
        bool        old_lastblk;

        /*
         * Detect the final old page by counting input bytes.  The output
         * block number must not be used for this: it advances by up to two
         * for every old page consumed.
         */
        totalBytesRead += BLCKSZ;
        old_lastblk = (totalBytesRead == src_filesize);

        /* Save the page header data */
        memcpy(&pageheader, buffer, SizeOfPageHeaderData);

        /*
         * old_cur points to the current position on the old page and
         * old_blkend to the end of the old block.  Each pass of the loop
         * below consumes rewriteVmBytesPerPage old bytes and emits one new
         * page.
         */
        old_cur = buffer + SizeOfPageHeaderData;
        old_blkend = buffer + bytesRead;

        while (old_blkend - old_cur >= rewriteVmBytesPerPage)
        {
            char        new_vmbuf[BLCKSZ];
            char       *new_cur = new_vmbuf;
            char       *old_break = old_cur + rewriteVmBytesPerPage;
            bool        empty = true;
            bool        old_lastpart;

            /* Copy page header in advance */
            memcpy(new_vmbuf, &pageheader, SizeOfPageHeaderData);

            /* Is this the last part of the last old page? */
            old_lastpart = old_lastblk && (old_blkend == old_break);

            new_cur += SizeOfPageHeaderData;

            /* Process old page bytes one by one, turning each into two. */
            while (old_cur < old_break)
            {
                uint8       byte = *(uint8 *) old_cur;
                uint16      new_vmbits = 0;
                int         i;

                /* Generate new format bits while keeping old information */
                for (i = 0; i < BITS_PER_BYTE; i++)
                {
                    if (byte & (1 << (BITS_PER_HEAPBLOCK_OLD * i)))
                    {
                        empty = false;
                        new_vmbits |= 1 << (BITS_PER_HEAPBLOCK * i);
                    }
                }

                /*
                 * Store the two new bytes explicitly, low-order byte first,
                 * so the on-disk layout does not depend on host endianness.
                 */
                new_cur[0] = (char) (new_vmbits & 0xFF);
                new_cur[1] = (char) (new_vmbits >> 8);

                old_cur++;
                new_cur += 2;
            }

            /* If the last part of the last old page is empty, don't write it */
            if (old_lastpart && empty)
                break;

            /* Set new checksum for a visibility map page (if enabled) */
            if (old_cluster.controldata.data_checksum_version != 0 &&
                new_cluster.controldata.data_checksum_version != 0)
                ((PageHeader) new_vmbuf)->pd_checksum =
                    pg_checksum_page(new_vmbuf, new_blkno);

            errno = 0;
            if (write(dst_fd, new_vmbuf, BLCKSZ) != BLCKSZ)
            {
                /* if write didn't set errno, assume problem is no disk space */
                save_errno = (errno != 0) ? errno : ENOSPC;
                close(dst_fd);
                close(src_fd);
                errno = save_errno;
                return getErrorText();
            }

            new_blkno++;
        }
    }

    /*
     * The loop ends on anything other than a full page.  Zero means clean
     * end-of-file; a negative value is a read error; a positive value is a
     * truncated trailing page.  Report the latter two instead of silently
     * producing an incomplete map.
     */
    if (bytesRead != 0)
    {
        save_errno = errno;
        close(dst_fd);
        close(src_fd);
        if (bytesRead < 0)
        {
            errno = save_errno;
            return getErrorText();
        }
        return "partial page found in visibility map file";
    }

    /* Close files */
    close(dst_fd);
    close(src_fd);

    return NULL;
}
void void
check_hard_link(void) check_hard_link(void)
{ {
......
...@@ -109,6 +109,10 @@ extern char *output_files[]; ...@@ -109,6 +109,10 @@ extern char *output_files[];
*/ */
#define VISIBILITY_MAP_CRASHSAFE_CAT_VER 201107031 #define VISIBILITY_MAP_CRASHSAFE_CAT_VER 201107031
/*
* The format of the visibility map changed in 9.6, which added a second
* (all-frozen) bit per heap page.
*/
#define VISIBILITY_MAP_FROZEN_BIT_CAT_VER 201603011
/* /*
* pg_multixact format changed in 9.3 commit 0ac5ad5134f2769ccbaefec73844f85, * pg_multixact format changed in 9.3 commit 0ac5ad5134f2769ccbaefec73844f85,
* ("Improve concurrency of foreign key locking") which also updated catalog * ("Improve concurrency of foreign key locking") which also updated catalog
...@@ -365,6 +369,8 @@ bool pid_lock_file_exists(const char *datadir); ...@@ -365,6 +369,8 @@ bool pid_lock_file_exists(const char *datadir);
const char *copyFile(const char *src, const char *dst, bool force); const char *copyFile(const char *src, const char *dst, bool force);
const char *linkFile(const char *src, const char *dst); const char *linkFile(const char *src, const char *dst);
const char *rewriteVisibilityMap(const char *fromfile, const char *tofile,
bool force);
void check_hard_link(void); void check_hard_link(void);
FILE *fopen_priv(const char *path, const char *mode); FILE *fopen_priv(const char *path, const char *mode);
......
...@@ -11,12 +11,13 @@ ...@@ -11,12 +11,13 @@
#include "pg_upgrade.h" #include "pg_upgrade.h"
#include <sys/stat.h>
#include "catalog/pg_class.h" #include "catalog/pg_class.h"
#include "access/transam.h" #include "access/transam.h"
static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace); static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace);
static void transfer_relfile(FileNameMap *map, const char *suffix); static void transfer_relfile(FileNameMap *map, const char *suffix, bool vm_must_add_frozenbit);
/* /*
...@@ -132,6 +133,7 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) ...@@ -132,6 +133,7 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
{ {
int mapnum; int mapnum;
bool vm_crashsafe_match = true; bool vm_crashsafe_match = true;
bool vm_must_add_frozenbit = false;
/* /*
* Do the old and new cluster disagree on the crash-safetiness of the vm * Do the old and new cluster disagree on the crash-safetiness of the vm
...@@ -141,13 +143,20 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) ...@@ -141,13 +143,20 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
new_cluster.controldata.cat_ver >= VISIBILITY_MAP_CRASHSAFE_CAT_VER) new_cluster.controldata.cat_ver >= VISIBILITY_MAP_CRASHSAFE_CAT_VER)
vm_crashsafe_match = false; vm_crashsafe_match = false;
/*
* Do we need to rewrite visibilitymap?
*/
if (old_cluster.controldata.cat_ver < VISIBILITY_MAP_FROZEN_BIT_CAT_VER &&
new_cluster.controldata.cat_ver >= VISIBILITY_MAP_FROZEN_BIT_CAT_VER)
vm_must_add_frozenbit = true;
for (mapnum = 0; mapnum < size; mapnum++) for (mapnum = 0; mapnum < size; mapnum++)
{ {
if (old_tablespace == NULL || if (old_tablespace == NULL ||
strcmp(maps[mapnum].old_tablespace, old_tablespace) == 0) strcmp(maps[mapnum].old_tablespace, old_tablespace) == 0)
{ {
/* transfer primary file */ /* transfer primary file */
transfer_relfile(&maps[mapnum], ""); transfer_relfile(&maps[mapnum], "", vm_must_add_frozenbit);
/* fsm/vm files added in PG 8.4 */ /* fsm/vm files added in PG 8.4 */
if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804) if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804)
...@@ -155,9 +164,9 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) ...@@ -155,9 +164,9 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
/* /*
* Copy/link any fsm and vm files, if they exist * Copy/link any fsm and vm files, if they exist
*/ */
transfer_relfile(&maps[mapnum], "_fsm"); transfer_relfile(&maps[mapnum], "_fsm", vm_must_add_frozenbit);
if (vm_crashsafe_match) if (vm_crashsafe_match)
transfer_relfile(&maps[mapnum], "_vm"); transfer_relfile(&maps[mapnum], "_vm", vm_must_add_frozenbit);
} }
} }
} }
...@@ -167,17 +176,19 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) ...@@ -167,17 +176,19 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
/* /*
* transfer_relfile() * transfer_relfile()
* *
* Copy or link file from old cluster to new one. * Copy or link file from old cluster to new one. If vm_must_add_frozenbit
* is true, visibility map forks are converted and rewritten, even in link
* mode.
*/ */
static void static void
transfer_relfile(FileNameMap *map, const char *type_suffix) transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit)
{ {
const char *msg; const char *msg;
char old_file[MAXPGPATH]; char old_file[MAXPGPATH];
char new_file[MAXPGPATH]; char new_file[MAXPGPATH];
int fd;
int segno; int segno;
char extent_suffix[65]; char extent_suffix[65];
struct stat statbuf;
/* /*
* Now copy/link any related segments as well. Remember, PG breaks large * Now copy/link any related segments as well. Remember, PG breaks large
...@@ -210,7 +221,7 @@ transfer_relfile(FileNameMap *map, const char *type_suffix) ...@@ -210,7 +221,7 @@ transfer_relfile(FileNameMap *map, const char *type_suffix)
if (type_suffix[0] != '\0' || segno != 0) if (type_suffix[0] != '\0' || segno != 0)
{ {
/* Did file open fail? */ /* Did file open fail? */
if ((fd = open(old_file, O_RDONLY, 0)) == -1) if (stat(old_file, &statbuf) != 0)
{ {
/* File does not exist? That's OK, just return */ /* File does not exist? That's OK, just return */
if (errno == ENOENT) if (errno == ENOENT)
...@@ -220,7 +231,10 @@ transfer_relfile(FileNameMap *map, const char *type_suffix) ...@@ -220,7 +231,10 @@ transfer_relfile(FileNameMap *map, const char *type_suffix)
map->nspname, map->relname, old_file, new_file, map->nspname, map->relname, old_file, new_file,
getErrorText()); getErrorText());
} }
close(fd);
/* If file is empty, just return */
if (statbuf.st_size == 0)
return;
} }
unlink(new_file); unlink(new_file);
...@@ -232,7 +246,13 @@ transfer_relfile(FileNameMap *map, const char *type_suffix) ...@@ -232,7 +246,13 @@ transfer_relfile(FileNameMap *map, const char *type_suffix)
{ {
pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", old_file, new_file); pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", old_file, new_file);
if ((msg = copyFile(old_file, new_file, true)) != NULL) /* Rewrite visibility map if needed */
if (vm_must_add_frozenbit && (strcmp(type_suffix, "_vm") == 0))
msg = rewriteVisibilityMap(old_file, new_file, true);
else
msg = copyFile(old_file, new_file, true);
if (msg)
pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
map->nspname, map->relname, old_file, new_file, msg); map->nspname, map->relname, old_file, new_file, msg);
} }
...@@ -240,7 +260,13 @@ transfer_relfile(FileNameMap *map, const char *type_suffix) ...@@ -240,7 +260,13 @@ transfer_relfile(FileNameMap *map, const char *type_suffix)
{ {
pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"\n", old_file, new_file); pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"\n", old_file, new_file);
if ((msg = linkFile(old_file, new_file)) != NULL) /* Rewrite visibility map if needed */
if (vm_must_add_frozenbit && (strcmp(type_suffix, "_vm") == 0))
msg = rewriteVisibilityMap(old_file, new_file, true);
else
msg = linkFile(old_file, new_file);
if (msg)
pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
map->nspname, map->relname, old_file, new_file, msg); map->nspname, map->relname, old_file, new_file, msg);
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment