init

2026-04-07 13:35:22 +08:00
commit 0120fa9ce3
1530 changed files with 424864 additions and 0 deletions
--- a/pg_include/storage/backendid.h
+++ b/pg_include/storage/backendid.h
@@ -0,0 +1,27 @@
+/*-------------------------------------------------------------------------
+ *
+ * backendid.h
+ *	  POSTGRES backend id communication definitions
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/backendid.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BACKENDID_H
+#define BACKENDID_H
+
+/* ----------------
+ *		-cim 8/17/90
+ * ----------------
+ */
+typedef int BackendId;			/* unique currently active backend identifier */
+
+#define InvalidBackendId		(-1)	/* reserved "no valid backend id" value */
+
+extern PGDLLIMPORT BackendId MyBackendId;		/* backend id of this backend */
+
+#endif   /* BACKENDID_H */
--- a/pg_include/storage/barrier.h
+++ b/pg_include/storage/barrier.h
@@ -0,0 +1,170 @@
+/*-------------------------------------------------------------------------
+ *
+ * barrier.h
+ *	  Memory barrier operations.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/barrier.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BARRIER_H
+#define BARRIER_H
+
+#include "storage/s_lock.h"
+
+extern slock_t dummy_spinlock;	/* taken and released by the fallback pg_memory_barrier() */
+
+/*
+ * A compiler barrier need not (and preferably should not) emit any actual
+ * machine code, but must act as an optimization fence: the compiler must not
+ * reorder loads or stores to main memory around the barrier.  However, the
+ * CPU may still reorder loads or stores at runtime, if the architecture's
+ * memory model permits this.
+ *
+ * A memory barrier must act as a compiler barrier, and in addition must
+ * guarantee that all loads and stores issued prior to the barrier are
+ * completed before any loads or stores issued after the barrier.  Unless
+ * loads and stores are totally ordered (which is not the case on most
+ * architectures) this requires issuing some sort of memory fencing
+ * instruction.
+ *
+ * A read barrier must act as a compiler barrier, and in addition must
+ * guarantee that any loads issued prior to the barrier are completed before
+ * any loads issued after the barrier.	Similarly, a write barrier acts
+ * as a compiler barrier, and also orders stores.  Read and write barriers
+ * are thus weaker than a full memory barrier, but stronger than a compiler
+ * barrier.  In practice, on machines with strong memory ordering, read and
+ * write barriers may require nothing more than a compiler barrier.
+ *
+ * For an introduction to using memory barriers within the PostgreSQL backend,
+ * see src/backend/storage/lmgr/README.barrier
+ */
+
+#if defined(DISABLE_BARRIERS)
+
+/*
+ * Fall through to the spinlock-based implementation.
+ */
+#elif defined(__INTEL_COMPILER)
+
+/*
+ * icc defines __GNUC__, but doesn't support gcc's inline asm syntax
+ */
+#define pg_memory_barrier()		_mm_mfence()
+#define pg_compiler_barrier()	__memory_barrier()
+#elif defined(__GNUC__)
+
+/* This works on any architecture, since it's only talking to GCC itself. */
+#define pg_compiler_barrier()	__asm__ __volatile__("" : : : "memory")
+
+#if defined(__i386__)
+
+/*
+ * i386 does not allow loads to be reordered with other loads, or stores to be
+ * reordered with other stores, but a load can be performed ahead of an earlier
+ * store (store-load reordering), so a full barrier needs a real instruction.
+ *
+ * "lock; addl" has worked for longer than "mfence".
+ */
+#define pg_memory_barrier()		\
+	__asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory")
+#define pg_read_barrier()		pg_compiler_barrier()
+#define pg_write_barrier()		pg_compiler_barrier()
+#elif defined(__x86_64__)		/* 64 bit x86 */
+
+/*
+ * x86_64 has similar ordering characteristics to i386.
+ *
+ * Technically, some x86-ish chips support uncached memory access and/or
+ * special instructions that are weakly ordered.  In those cases we'd need
+ * the read and write barriers to be lfence and sfence.  But since we don't
+ * do those things, a compiler barrier should be enough.
+ */
+#define pg_memory_barrier()		\
+	__asm__ __volatile__ ("lock; addl $0,0(%%rsp)" : : : "memory")
+#define pg_read_barrier()		pg_compiler_barrier()
+#define pg_write_barrier()		pg_compiler_barrier()
+#elif defined(__ia64__) || defined(__ia64)
+
+/*
+ * Itanium is weakly ordered, so read and write barriers require a full
+ * fence.
+ */
+#define pg_memory_barrier()		__asm__ __volatile__ ("mf" : : : "memory")
+#elif defined(__ppc__) || defined(__powerpc__) || defined(__ppc64__) || defined(__powerpc64__)
+
+/*
+ * lwsync orders loads with respect to each other, and similarly with stores.
+ * But a load can still be performed ahead of an earlier store, so sync must be
+ * used for a full memory barrier.
+ */
+#define pg_memory_barrier()		__asm__ __volatile__ ("sync" : : : "memory")
+#define pg_read_barrier()		__asm__ __volatile__ ("lwsync" : : : "memory")
+#define pg_write_barrier()		__asm__ __volatile__ ("lwsync" : : : "memory")
+#elif defined(__alpha) || defined(__alpha__)	/* Alpha */
+
+/*
+ * Unlike all other known architectures, Alpha allows dependent reads to be
+ * reordered, but we don't currently find it necessary to provide a conditional
+ * read barrier to cover that case.  We might need to add that later.
+ */
+#define pg_memory_barrier()		__asm__ __volatile__ ("mb" : : : "memory")
+#define pg_read_barrier()		__asm__ __volatile__ ("rmb" : : : "memory")
+#define pg_write_barrier()		__asm__ __volatile__ ("wmb" : : : "memory")
+#elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)
+
+/*
+ * If we're on GCC 4.1.0 or higher, we should be able to get a memory
+ * barrier out of this compiler built-in.  But we prefer to rely on our
+ * own definitions where possible, and use this only as a fallback.
+ */
+#define pg_memory_barrier()		__sync_synchronize()
+#endif
+#elif defined(__ia64__) || defined(__ia64)	/* Itanium, neither icc nor GCC */
+
+#define pg_compiler_barrier()	_Asm_sched_fence()
+#define pg_memory_barrier()		_Asm_mf()
+#elif defined(WIN32_ONLY_COMPILER)
+
+/* Should work on both MSVC and Borland. */
+#include <intrin.h>
+#pragma intrinsic(_ReadWriteBarrier)
+#define pg_compiler_barrier()	_ReadWriteBarrier()
+#define pg_memory_barrier()		MemoryBarrier()
+#endif
+
+/*
+ * If we have no memory barrier implementation for this architecture, we
+ * fall back to acquiring and releasing a spinlock.  This might, in turn,
+ * fall back to the semaphore-based spinlock implementation, which will be
+ * amazingly slow.  As with the versions above, no arguments are taken.
+ *
+ * It's not self-evident that every possible legal implementation of a
+ * spinlock acquire-and-release would be equivalent to a full memory barrier.
+ * For example, I'm not sure that Itanium's acq and rel add up to a full
+ * fence.  But all of our actual implementations seem OK in this regard.
+ */
+#if !defined(pg_memory_barrier)
+#define pg_memory_barrier() \
+	do { S_LOCK(&dummy_spinlock); S_UNLOCK(&dummy_spinlock); } while (0)
+#endif
+
+/*
+ * Any read or write barrier that is still undefined at this point is
+ * promoted to a full memory barrier.
+ *
+ * No such promotion is done for the compiler barrier: a full memory
+ * barrier is probably not what you want in its place, so code that needs
+ * pg_compiler_barrier() must test for it with #ifdef.
+ */
+#ifndef pg_read_barrier
+#define pg_read_barrier()			pg_memory_barrier()
+#endif
+#ifndef pg_write_barrier
+#define pg_write_barrier()			pg_memory_barrier()
+#endif
+
+#endif   /* BARRIER_H */
--- a/pg_include/storage/block.h
+++ b/pg_include/storage/block.h
@@ -0,0 +1,121 @@
+/*-------------------------------------------------------------------------
+ *
+ * block.h
+ *	  POSTGRES disk block definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/block.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BLOCK_H
+#define BLOCK_H
+
+/*
+ * BlockNumber:
+ *
+ * each data file (heap or index) is divided into postgres disk blocks
+ * (which may be thought of as the unit of i/o -- a postgres buffer
+ * contains exactly one disk block).  the blocks are numbered
+ * sequentially, 0 to 0xFFFFFFFE.
+ *
+ * InvalidBlockNumber is the same thing as P_NEW in buf.h.
+ *
+ * the access methods, the buffer manager and the storage manager are
+ * more or less the only pieces of code that should be accessing disk
+ * blocks directly.
+ */
+typedef uint32 BlockNumber;
+
+#define InvalidBlockNumber		((BlockNumber) 0xFFFFFFFF)	/* never a real block */
+
+#define MaxBlockNumber			((BlockNumber) 0xFFFFFFFE)	/* highest valid block */
+
+/*
+ * BlockId:
+ *
+ * this is a storage type for BlockNumber.	in other words, this type
+ * is used for on-disk structures (e.g., in HeapTupleData) whereas
+ * BlockNumber is the type on which calculations are performed (e.g.,
+ * in access method code).
+ *
+ * there doesn't appear to be any reason to have separate types except
+ * for the fact that BlockIds can be SHORTALIGN'd (and therefore any
+ * structures that contain them, such as ItemPointerData, can also be
+ * SHORTALIGN'd).  this is an important consideration for reducing the
+ * space requirements of the line pointer (ItemIdData) array on each
+ * page and the header of each heap or index tuple, so it doesn't seem
+ * wise to change this without good reason.
+ */
+typedef struct BlockIdData
+{
+	uint16		bi_hi;			/* high-order 16 bits of the block number */
+	uint16		bi_lo;			/* low-order 16 bits of the block number */
+} BlockIdData;
+
+typedef BlockIdData *BlockId;	/* block identifier */
+
+/* ----------------
+ *		support macros
+ * ----------------
+ */
+
+/*
+ * BlockNumberIsValid
+ *		True iff blockNumber, cast to BlockNumber, is not InvalidBlockNumber.
+ */
+#define BlockNumberIsValid(blockNumber) \
+	((bool) ((BlockNumber) (blockNumber) != InvalidBlockNumber))
+
+/*
+ * BlockIdIsValid
+ *		True iff blockId passes PointerIsValid(); its contents are not examined.
+ */
+#define BlockIdIsValid(blockId) \
+	((bool) PointerIsValid(blockId))
+
+/*
+ * BlockIdSet
+ *		Stores blockNumber (as a BlockNumber) into bi_hi/bi_lo; args used twice.
+ */
+#define BlockIdSet(blockId, blockNumber) \
+( \
+	AssertMacro(PointerIsValid(blockId)), \
+	(blockId)->bi_hi = (uint16) ((BlockNumber) (blockNumber) >> 16), \
+	(blockId)->bi_lo = (uint16) ((BlockNumber) (blockNumber) & 0xffff) \
+)
+
+/*
+ * BlockIdCopy
+ *		Copies both halves; each argument is evaluated more than once.
+ */
+#define BlockIdCopy(toBlockId, fromBlockId) \
+( \
+	AssertMacro(PointerIsValid(toBlockId)), \
+	AssertMacro(PointerIsValid(fromBlockId)), \
+	(toBlockId)->bi_hi = (fromBlockId)->bi_hi, \
+	(toBlockId)->bi_lo = (fromBlockId)->bi_lo \
+)
+
+/*
+ * BlockIdEquals
+ *		True iff both bi_hi and bi_lo match; pointers are not validity-checked.
+ */
+#define BlockIdEquals(blockId1, blockId2) \
+	((blockId1)->bi_hi == (blockId2)->bi_hi && \
+	 (blockId1)->bi_lo == (blockId2)->bi_lo)
+
+/*
+ * BlockIdGetBlockNumber
+ *		Rebuild the block number; bi_hi is widened first so the shift is unsigned.
+ */
+#define BlockIdGetBlockNumber(blockId) \
+( \
+	AssertMacro(BlockIdIsValid(blockId)), \
+	(BlockNumber) ((((BlockNumber) (blockId)->bi_hi) << 16) | (blockId)->bi_lo) \
+)
+
+#endif   /* BLOCK_H */
--- a/pg_include/storage/buf.h
+++ b/pg_include/storage/buf.h
@@ -0,0 +1,46 @@
+/*-------------------------------------------------------------------------
+ *
+ * buf.h
+ *	  Basic buffer manager data types.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/buf.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BUF_H
+#define BUF_H
+
+/*
+ * Buffer identifiers.
+ *
+ * Zero is invalid, positive is the index of a shared buffer (1..NBuffers),
+ * negative is the index of a local buffer (-1 .. -NLocBuffer).
+ */
+typedef int Buffer;
+
+#define InvalidBuffer	0
+
+/*
+ * BufferIsInvalid
+ *		True iff buffer equals InvalidBuffer (zero); no range check is made.
+ */
+#define BufferIsInvalid(buffer) ((buffer) == InvalidBuffer)
+
+/*
+ * BufferIsLocal
+ *		True iff buffer < 0, i.e. local (not visible to other backends).
+ */
+#define BufferIsLocal(buffer)	((buffer) < 0)
+
+/*
+ * Buffer access strategy objects.
+ *
+ * BufferAccessStrategyData is private to freelist.c
+ */
+typedef struct BufferAccessStrategyData *BufferAccessStrategy;
+
+#endif   /* BUF_H */
--- a/pg_include/storage/buf_internals.h
+++ b/pg_include/storage/buf_internals.h
@@ -0,0 +1,216 @@
+/*-------------------------------------------------------------------------
+ *
+ * buf_internals.h
+ *	  Internal definitions for buffer manager and the buffer replacement
+ *	  strategy.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/buf_internals.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BUFMGR_INTERNALS_H
+#define BUFMGR_INTERNALS_H
+
+#include "storage/buf.h"
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "storage/smgr.h"
+#include "storage/spin.h"
+#include "utils/relcache.h"
+
+
+/*
+ * Flags for buffer descriptors
+ *
+ * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
+ * entry associated with the buffer's tag.
+ */
+#define BM_DIRTY				(1 << 0)		/* data needs writing */
+#define BM_VALID				(1 << 1)		/* data is valid */
+#define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
+#define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
+#define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
+#define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
+#define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
+#define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
+#define BM_PERMANENT			(1 << 8)		/* permanent relation (not
+												 * unlogged) */
+
+typedef bits16 BufFlags;		/* bitmask of the BM_* flags above */
+
+/*
+ * The maximum allowed value of usage_count represents a tradeoff between
+ * accuracy and speed of the clock-sweep buffer management algorithm.  A
+ * large value (comparable to NBuffers) would approximate LRU semantics.
+ * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
+ * clock sweeps to find a free buffer, so in practice we don't want the
+ * value to be very large.
+ */
+#define BM_MAX_USAGE_COUNT	5
+
+/*
+ * Buffer tag identifies which disk block the buffer contains.
+ *
+ * Note: the BufferTag data must be sufficient to determine where to write the
+ * block, without reference to pg_class or pg_tablespace entries.  It's
+ * possible that the backend flushing the buffer doesn't even believe the
+ * relation is visible yet (its xact may have started before the xact that
+ * created the rel).  The storage manager must be able to cope anyway.
+ *
+ * Note: if there are any pad bytes in the struct, INIT_BUFFERTAG will have
+ * to be fixed to zero them, since this struct is used as a hash key.
+ */
+typedef struct buftag
+{
+	RelFileNode rnode;			/* physical relation identifier */
+	ForkNumber	forkNum;		/* fork of the relation */
+	BlockNumber blockNum;		/* blknum relative to begin of reln */
+} BufferTag;
+
+#define CLEAR_BUFFERTAG(a) /* set every field of tag a to its Invalid* value */ \
+( \
+	(a).rnode.spcNode = InvalidOid, \
+	(a).rnode.dbNode = InvalidOid, \
+	(a).rnode.relNode = InvalidOid, \
+	(a).forkNum = InvalidForkNumber, \
+	(a).blockNum = InvalidBlockNumber \
+)
+
+#define INIT_BUFFERTAG(a,xx_rnode,xx_forkNum,xx_blockNum) /* assigns the three fields only */ \
+( \
+	(a).rnode = (xx_rnode), \
+	(a).forkNum = (xx_forkNum), \
+	(a).blockNum = (xx_blockNum) \
+)
+
+#define BUFFERTAGS_EQUAL(a,b) /* true iff rnode, blockNum and forkNum all match */ \
+( \
+	RelFileNodeEquals((a).rnode, (b).rnode) && \
+	(a).blockNum == (b).blockNum && \
+	(a).forkNum == (b).forkNum \
+)
+
+/*
+ * The shared buffer mapping table is partitioned to reduce contention.
+ * To determine which partition lock a given tag requires, compute the tag's
+ * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
+ * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
+ */
+#define BufTableHashPartition(hashcode) \
+	((hashcode) % NUM_BUFFER_PARTITIONS)
+#define BufMappingPartitionLock(hashcode) \
+	((LWLockId) (FirstBufMappingLock + BufTableHashPartition(hashcode)))
+
+/*
+ *	BufferDesc -- shared descriptor/state data for a single shared buffer.
+ *
+ * Note: buf_hdr_lock must be held to examine or change the tag, flags,
+ * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
+ * changes after initialization, so does not need locking.	freeNext is
+ * protected by the BufFreelistLock not buf_hdr_lock.  The LWLocks can take
+ * care of themselves.	The buf_hdr_lock is *not* used to control access to
+ * the data in the buffer!
+ *
+ * An exception is that if we have the buffer pinned, its tag can't change
+ * underneath us, so we can examine the tag without locking the spinlock.
+ * Also, in places we do one-time reads of the flags without bothering to
+ * lock the spinlock; this is generally for situations where we don't expect
+ * the flag bit being tested to be changing.
+ *
+ * We can't physically remove items from a disk page if another backend has
+ * the buffer pinned.  Hence, a backend may need to wait for all other pins
+ * to go away.	This is signaled by storing its own PID into
+ * wait_backend_pid and setting flag bit BM_PIN_COUNT_WAITER.  At present,
+ * there can be only one such waiter per buffer.
+ *
+ * We use this same struct for local buffer headers, but the lock fields
+ * are not used and not all of the flag bits are useful either.
+ */
+typedef struct sbufdesc
+{
+	BufferTag	tag;			/* ID of page contained in buffer */
+	BufFlags	flags;			/* see bit definitions above */
+	uint16		usage_count;	/* usage counter for clock sweep code */
+	unsigned	refcount;		/* # of backends holding pins on buffer */
+	int			wait_backend_pid;		/* backend PID of pin-count waiter */
+
+	slock_t		buf_hdr_lock;	/* protects the above fields */
+
+	int			buf_id;			/* buffer's index number (from 0) */
+	int			freeNext;		/* link in freelist chain */
+
+	LWLockId	io_in_progress_lock;	/* to wait for I/O to complete */
+	LWLockId	content_lock;	/* to lock access to buffer contents */
+} BufferDesc;
+
+#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)	/* 0-based buf_id -> Buffer */
+
+/*
+ * The freeNext field is either the index of the next freelist entry,
+ * or one of these special values:
+ */
+#define FREENEXT_END_OF_LIST	(-1)
+#define FREENEXT_NOT_IN_LIST	(-2)
+
+/*
+ * Macros for acquiring/releasing a shared buffer header's spinlock.
+ * Do not apply these to local buffers!
+ *
+ * Note: as a general coding rule, if you are using these then you probably
+ * need to be using a volatile-qualified pointer to the buffer header, to
+ * ensure that the compiler doesn't rearrange accesses to the header to
+ * occur before or after the spinlock is acquired/released.
+ */
+#define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
+#define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
+
+
+/* in buf_init.c */
+extern PGDLLIMPORT BufferDesc *BufferDescriptors;
+
+/* in localbuf.c */
+extern BufferDesc *LocalBufferDescriptors;
+
+
+/*
+ * Internal routines: only called by bufmgr
+ */
+
+/* freelist.c */
+extern volatile BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
+				  bool *lock_held);
+extern void StrategyFreeBuffer(volatile BufferDesc *buf);
+extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
+					 volatile BufferDesc *buf);
+
+extern int	StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
+extern void StrategyNotifyBgWriter(Latch *bgwriterLatch);
+
+extern Size StrategyShmemSize(void);
+extern void StrategyInitialize(bool init);
+
+/* buf_table.c */
+extern Size BufTableShmemSize(int size);
+extern void InitBufTable(int size);
+extern uint32 BufTableHashCode(BufferTag *tagPtr);
+extern int	BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
+extern int	BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
+extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
+
+/* localbuf.c */
+extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
+					BlockNumber blockNum);
+extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
+				 BlockNumber blockNum, bool *foundPtr);
+extern void MarkLocalBufferDirty(Buffer buffer);
+extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
+							BlockNumber firstDelBlock);
+extern void DropRelFileNodeAllLocalBuffers(RelFileNode rnode);
+extern void AtEOXact_LocalBuffers(bool isCommit);
+
+#endif   /* BUFMGR_INTERNALS_H */
--- a/pg_include/storage/buffile.h
+++ b/pg_include/storage/buffile.h
@@ -0,0 +1,45 @@
+/*-------------------------------------------------------------------------
+ *
+ * buffile.h
+ *	  Management of large buffered files, primarily temporary files.
+ *
+ * The BufFile routines provide a partial replacement for stdio atop
+ * virtual file descriptors managed by fd.c.  Currently they only support
+ * buffered access to a virtual file, without any of stdio's formatting
+ * features.  That's enough for immediate needs, but the set of facilities
+ * could be expanded if necessary.
+ *
+ * BufFile also supports working with temporary files that exceed the OS
+ * file size limit and/or the largest offset representable in an int.
+ * It might be better to split that out as a separately accessible module,
+ * but currently we have no need for oversize temp files without buffered
+ * access.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/buffile.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef BUFFILE_H
+#define BUFFILE_H
+
+/* BufFile is an opaque type whose details are not known outside buffile.c. */
+
+typedef struct BufFile BufFile;
+
+/*
+ * prototypes for functions in buffile.c
+ */
+
+extern BufFile *BufFileCreateTemp(bool interXact);
+extern void BufFileClose(BufFile *file);
+extern size_t BufFileRead(BufFile *file, void *ptr, size_t size);
+extern size_t BufFileWrite(BufFile *file, void *ptr, size_t size);
+extern int	BufFileSeek(BufFile *file, int fileno, off_t offset, int whence);
+extern void BufFileTell(BufFile *file, int *fileno, off_t *offset);
+extern int	BufFileSeekBlock(BufFile *file, long blknum);
+
+#endif   /* BUFFILE_H */
--- a/pg_include/storage/bufmgr.h
+++ b/pg_include/storage/bufmgr.h
@@ -0,0 +1,226 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufmgr.h
+ *	  POSTGRES buffer manager definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/bufmgr.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BUFMGR_H
+#define BUFMGR_H
+
+#include "storage/block.h"
+#include "storage/buf.h"
+#include "storage/bufpage.h"
+#include "storage/relfilenode.h"
+#include "utils/relcache.h"
+
+typedef void *Block;
+
+/* Possible arguments for GetAccessStrategy() */
+typedef enum BufferAccessStrategyType
+{
+	BAS_NORMAL,					/* Normal random access */
+	BAS_BULKREAD,				/* Large read-only scan (hint bit updates are
+								 * ok) */
+	BAS_BULKWRITE,				/* Large multi-block write (e.g. COPY IN) */
+	BAS_VACUUM					/* VACUUM */
+} BufferAccessStrategyType;
+
+/* Possible modes for ReadBufferExtended() */
+typedef enum
+{
+	RBM_NORMAL,					/* Normal read */
+	RBM_ZERO,					/* Don't read from disk, caller will
+								 * initialize */
+	RBM_ZERO_ON_ERROR			/* Read, but return an all-zeros page on error */
+} ReadBufferMode;
+
+/* in globals.c ... this duplicates miscadmin.h */
+extern PGDLLIMPORT int NBuffers;
+
+/* in bufmgr.c */
+extern bool zero_damaged_pages;
+extern int	bgwriter_lru_maxpages;
+extern double bgwriter_lru_multiplier;
+extern bool track_io_timing;
+extern int	target_prefetch_pages;
+
+/* in buf_init.c */
+extern PGDLLIMPORT char *BufferBlocks;
+extern PGDLLIMPORT int32 *PrivateRefCount;
+
+/* in localbuf.c */
+extern PGDLLIMPORT int NLocBuffer;
+extern PGDLLIMPORT Block *LocalBufferBlockPointers;
+extern PGDLLIMPORT int32 *LocalRefCount;
+
+/* special block number for ReadBuffer() */
+#define P_NEW	InvalidBlockNumber		/* grow the file to get a new page */
+
+/*
+ * Buffer content lock modes (mode argument for LockBuffer())
+ */
+#define BUFFER_LOCK_UNLOCK		0
+#define BUFFER_LOCK_SHARE		1
+#define BUFFER_LOCK_EXCLUSIVE	2
+
+/*
+ * These routines are beaten on quite heavily, hence the macroization.
+ */
+
+/*
+ * BufferIsValid
+ *		True iff the given buffer number is valid (either as a shared
+ *		or local buffer).
+ *
+ * Note: For a long time this was defined the same as BufferIsPinned,
+ * that is it would say False if you didn't hold a pin on the buffer.
+ * I believe this was bogus and served only to mask logic errors.
+ * Code should always know whether it has a buffer reference,
+ * independently of the pin state.
+ *
+ * Note: For a further long time this was not quite the inverse of the
+ * BufferIsInvalid() macro, in that it also did sanity checks to verify
+ * that the buffer number was in range.  Most likely, this macro was
+ * originally intended only to be used in assertions, but its use has
+ * since expanded quite a bit, and the overhead of making those checks
+ * even in non-assert-enabled builds can be significant.  Thus, we've
+ * now demoted the range checks to assertions within the macro itself.
+ */
+#define BufferIsValid(bufnum) \
+( \
+	AssertMacro((bufnum) <= NBuffers && (bufnum) >= -NLocBuffer), \
+	(bufnum) != InvalidBuffer  \
+)
+
+/*
+ * BufferIsPinned
+ *		False for an invalid buffer number; otherwise true iff this backend's
+ *		own reference count for the buffer is positive.
+ *		Only the calling backend's pins count (LocalRefCount for local
+ *		buffers, PrivateRefCount for shared ones); other backends' do not.
+ */
+#define BufferIsPinned(bufnum) \
+( \
+	BufferIsValid(bufnum) && \
+	( \
+		BufferIsLocal(bufnum) ? \
+			(LocalRefCount[-(bufnum) - 1] > 0) \
+		: \
+			(PrivateRefCount[(bufnum) - 1] > 0) \
+	) \
+)
+
+/*
+ * BufferGetBlock
+ *		Returns a reference to a disk page image associated with a buffer:
+ *		an entry of LocalBufferBlockPointers[] for a local buffer, else the
+ *		buffer's BLCKSZ-sized slot within BufferBlocks.
+ *		Assumes buffer is valid (this is only checked by AssertMacro).
+ */
+#define BufferGetBlock(buffer) \
+( \
+	AssertMacro(BufferIsValid(buffer)), \
+	BufferIsLocal(buffer) ? \
+		LocalBufferBlockPointers[-(buffer) - 1] \
+	: \
+		(Block) (BufferBlocks + ((Size) ((buffer) - 1)) * BLCKSZ) \
+)
+
+/*
+ * BufferGetPageSize
+ *		Returns the page size within a buffer.
+ *
+ * Notes:
+ *		Assumes buffer is valid.
+ *
+ *		The buffer can be a raw disk block and need not contain a valid
+ *		(formatted) disk page.
+ */
+/* XXX should dig out of buffer descriptor */
+#define BufferGetPageSize(buffer) \
+( \
+	AssertMacro(BufferIsValid(buffer)), \
+	(Size)BLCKSZ \
+)
+
+/*
+ * BufferGetPage
+ *		Returns the page associated with a buffer.
+ */
+#define BufferGetPage(buffer) ((Page)BufferGetBlock(buffer))
+
+/*
+ * prototypes for functions in bufmgr.c
+ */
+extern void PrefetchBuffer(Relation reln, ForkNumber forkNum,
+			   BlockNumber blockNum);
+extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
+extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
+				   BlockNumber blockNum, ReadBufferMode mode,
+				   BufferAccessStrategy strategy);
+extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
+						  ForkNumber forkNum, BlockNumber blockNum,
+						  ReadBufferMode mode, BufferAccessStrategy strategy);
+extern void ReleaseBuffer(Buffer buffer);
+extern void UnlockReleaseBuffer(Buffer buffer);
+extern void MarkBufferDirty(Buffer buffer);
+extern void IncrBufferRefCount(Buffer buffer);
+extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
+					 BlockNumber blockNum);
+
+extern void InitBufferPool(void);
+extern void InitBufferPoolAccess(void);
+extern void InitBufferPoolBackend(void);
+extern void AtEOXact_Buffers(bool isCommit);
+extern void PrintBufferLeakWarning(Buffer buffer);
+extern void CheckPointBuffers(int flags);
+extern BlockNumber BufferGetBlockNumber(Buffer buffer);
+extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation,
+								ForkNumber forkNum);
+extern void FlushRelationBuffers(Relation rel);
+extern void FlushDatabaseBuffers(Oid dbid);
+extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
+					   ForkNumber forkNum, BlockNumber firstDelBlock);
+extern void DropRelFileNodeAllBuffers(RelFileNodeBackend rnode);
+extern void DropDatabaseBuffers(Oid dbid);
+
+#define RelationGetNumberOfBlocks(reln) \
+	RelationGetNumberOfBlocksInFork(reln, MAIN_FORKNUM)
+
+extern bool BufferIsPermanent(Buffer buffer);
+
+#ifdef NOT_USED
+extern void PrintPinnedBufs(void);
+#endif
+extern Size BufferShmemSize(void);
+extern void BufferGetTag(Buffer buffer, RelFileNode *rnode,
+			 ForkNumber *forknum, BlockNumber *blknum);
+
+extern void SetBufferCommitInfoNeedsSave(Buffer buffer);
+
+extern void UnlockBuffers(void);
+extern void LockBuffer(Buffer buffer, int mode);
+extern bool ConditionalLockBuffer(Buffer buffer);
+extern void LockBufferForCleanup(Buffer buffer);
+extern bool ConditionalLockBufferForCleanup(Buffer buffer);
+extern bool HoldingBufferPinThatDelaysRecovery(void);
+
+extern void AbortBufferIO(void);
+
+extern void BufmgrCommit(void);
+extern bool BgBufferSync(void);
+
+extern void AtProcExit_LocalBuffers(void);
+
+/* in freelist.c */
+extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype);
+extern void FreeAccessStrategy(BufferAccessStrategy strategy);
+
+#endif
--- a/pg_include/storage/bufpage.h
+++ b/pg_include/storage/bufpage.h
@@ -0,0 +1,385 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufpage.h
+ *	  Standard POSTGRES buffer page definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/bufpage.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BUFPAGE_H
+#define BUFPAGE_H
+
+#include "access/xlogdefs.h"
+#include "storage/item.h"
+#include "storage/off.h"
+
+/*
+ * A postgres disk page is an abstraction layered on top of a postgres
+ * disk block (which is simply a unit of i/o, see block.h).
+ *
+ * specifically, while a disk block can be unformatted, a postgres
+ * disk page is always a slotted page of the form:
+ *
+ * +----------------+---------------------------------+
+ * | PageHeaderData | linp1 linp2 linp3 ...			  |
+ * +-----------+----+---------------------------------+
+ * | ... linpN |									  |
+ * +-----------+--------------------------------------+
+ * |		   ^ pd_lower							  |
+ * |												  |
+ * |			 v pd_upper							  |
+ * +-------------+------------------------------------+
+ * |			 | tupleN ...						  |
+ * +-------------+------------------+-----------------+
+ * |	   ... tuple3 tuple2 tuple1 | "special space" |
+ * +--------------------------------+-----------------+
+ *									^ pd_special
+ *
+ * a page is full when nothing can be added between pd_lower and
+ * pd_upper.
+ *
+ * all blocks written out by an access method must be disk pages.
+ *
+ * EXCEPTIONS:
+ *
+ * obviously, a page is not formatted before it is initialized by
+ * a call to PageInit.
+ *
+ * NOTES:
+ *
+ * linp1..N form an ItemId array.  ItemPointers point into this array
+ * rather than pointing directly to a tuple.  Note that OffsetNumbers
+ * conventionally start at 1, not 0.
+ *
+ * tuple1..N are added "backwards" on the page.  because a tuple's
+ * ItemPointer points to its ItemId entry rather than its actual
+ * byte-offset position, tuples can be physically shuffled on a page
+ * whenever the need arises.
+ *
+ * AM-generic per-page information is kept in PageHeaderData.
+ *
+ * AM-specific per-page data (if any) is kept in the area marked "special
+ * space"; each AM has an "opaque" structure defined somewhere that is
+ * stored as the page trailer.  an access method should always
+ * initialize its pages with PageInit and then set its own opaque
+ * fields.
+ */
+
+typedef Pointer Page;
+
+
+/*
+ * location (byte offset) within a page.
+ *
+ * note that this is actually limited to 2^15 because we have limited
+ * ItemIdData.lp_off and ItemIdData.lp_len to 15 bits (see itemid.h).
+ */
+typedef uint16 LocationIndex;
+
+
+/*
+ * disk page organization
+ *
+ * space management information generic to any page
+ *
+ *		pd_lsn		- identifies xlog record for last change to this page.
+ *		pd_tli		- ditto.
+ *		pd_flags	- flag bits.
+ *		pd_lower	- offset to start of free space.
+ *		pd_upper	- offset to end of free space.
+ *		pd_special	- offset to start of special space.
+ *		pd_pagesize_version - size in bytes and page layout version number.
+ *		pd_prune_xid - oldest XID among potentially prunable tuples on page.
+ *
+ * The LSN is used by the buffer manager to enforce the basic rule of WAL:
+ * "thou shalt write xlog before data".  A dirty buffer cannot be dumped
+ * to disk until xlog has been flushed at least as far as the page's LSN.
+ * We also store the 16 least significant bits of the TLI for identification
+ * purposes (it is not clear that this is actually necessary, but it seems
+ * like a good idea).
+ *
+ * pd_prune_xid is a hint field that helps determine whether pruning will be
+ * useful.  It is currently unused in index pages.
+ *
+ * The page version number and page size are packed together into a single
+ * uint16 field.  This is for historical reasons: before PostgreSQL 7.3,
+ * there was no concept of a page version number, and doing it this way
+ * lets us pretend that pre-7.3 databases have page version number zero.
+ * We constrain page sizes to be multiples of 256, leaving the low eight
+ * bits available for a version number.
+ *
+ * Minimum possible page size is perhaps 64B to fit page header, opaque space
+ * and a minimal tuple; of course, in reality you want it much bigger, so
+ * the constraint on pagesize mod 256 is not an important restriction.
+ * On the high end, we can only support pages up to 32KB because lp_off/lp_len
+ * are 15 bits.
+ */
+typedef struct PageHeaderData
+{
+	/* XXX LSN is member of *any* block, not only page-organized ones */
+	XLogRecPtr	pd_lsn;			/* LSN: next byte after last byte of xlog
+								 * record for last change to this page */
+	uint16		pd_tli;			/* least significant bits of the TimeLineID
+								 * containing the LSN */
+	uint16		pd_flags;		/* flag bits, see below */
+	LocationIndex pd_lower;		/* offset to start of free space */
+	LocationIndex pd_upper;		/* offset to end of free space */
+	LocationIndex pd_special;	/* offset to start of special space */
+	uint16		pd_pagesize_version;	/* page size (bits 0xFF00) OR'ed with
+										 * layout version (bits 0x00FF) */
+	TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
+	ItemIdData	pd_linp[1];		/* beginning of line pointer array; not
+								 * counted in SizeOfPageHeaderData */
+} PageHeaderData;
+
+typedef PageHeaderData *PageHeader;
+
+/*
+ * pd_flags contains the following flag bits.  Undefined bits are initialized
+ * to zero and may be used in the future.
+ *
+ * PD_HAS_FREE_LINES is set if there are any LP_UNUSED line pointers before
+ * pd_lower.  This should be considered a hint rather than the truth, since
+ * changes to it are not WAL-logged.
+ *
+ * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the
+ * page for its new tuple version; this suggests that a prune is needed.
+ * Again, this is just a hint.
+ *
+ * PD_ALL_VISIBLE is set if all tuples on the page are visible to everyone.
+ *
+ * PD_VALID_FLAG_BITS is the OR of the three bits above
+ * (0x0001 | 0x0002 | 0x0004).
+ */
+#define PD_HAS_FREE_LINES	0x0001		/* are there any unused line pointers? */
+#define PD_PAGE_FULL		0x0002		/* not enough free space for new
+										 * tuple? */
+#define PD_ALL_VISIBLE		0x0004		/* all tuples on page are visible to
+										 * everyone */
+
+#define PD_VALID_FLAG_BITS	0x0007		/* OR of all valid pd_flags bits */
+
+/*
+ * Page layout version number 0 is for pre-7.3 Postgres releases.
+ * Releases 7.3 and 7.4 use 1, denoting a new HeapTupleHeader layout.
+ * Release 8.0 uses 2; it changed the HeapTupleHeader layout again.
+ * Release 8.1 uses 3; it redefined HeapTupleHeader infomask bits.
+ * Release 8.3 uses 4; it changed the HeapTupleHeader layout again, and
+ *		added the pd_flags field (by stealing some bits from pd_tli),
+ *		as well as adding the pd_prune_xid field (which enlarges the header).
+ */
+#define PG_PAGE_LAYOUT_VERSION		4
+
+
+/* ----------------------------------------------------------------
+ *						page support macros
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * PageIsValid
+ *		True iff page is valid.
+ */
+#define PageIsValid(page) PointerIsValid(page)
+
+/*
+ * line pointer(s) do not count as part of header
+ */
+#define SizeOfPageHeaderData (offsetof(PageHeaderData, pd_linp))
+
+/*
+ * PageIsEmpty
+ *		True iff the line pointer array is empty, i.e. pd_lower has not
+ *		moved past the fixed-size page header.
+ */
+#define PageIsEmpty(page) \
+	(SizeOfPageHeaderData >= ((PageHeader) (page))->pd_lower)
+
+/*
+ * PageIsNew
+ *		True iff pd_upper is still zero, which is how a page that has not
+ *		been initialized (by PageInit) is recognized.
+ */
+#define PageIsNew(page) \
+	(0 == ((PageHeader) (page))->pd_upper)
+
+/*
+ * PageGetItemId
+ *		Returns a pointer to the line pointer (ItemId) stored at the given
+ *		1-based offset number in the page's pd_linp array.
+ */
+#define PageGetItemId(page, offsetNumber) \
+	((ItemId) (((PageHeader) (page))->pd_linp + ((offsetNumber) - 1)))
+
+/*
+ * PageGetContents
+ *		Returns a char pointer to the first byte after the MAXALIGN'd page
+ *		header.  To be used in case the page does not contain item pointers.
+ *
+ * Note: prior to 8.3 the result was not guaranteed to be MAXALIGN'd; now it
+ * is.  Beware of old code that might think the contents start at
+ * SizeOfPageHeaderData rather than at MAXALIGN(SizeOfPageHeaderData).
+ */
+#define PageGetContents(page) \
+	(&((char *) (page))[MAXALIGN(SizeOfPageHeaderData)])
+
+/* ----------------
+ *		macros to access page size info
+ * ----------------
+ */
+
+/*
+ * PageSizeIsValid
+ *		True iff the page size is valid.
+ */
+#define PageSizeIsValid(pageSize) ((pageSize) == BLCKSZ)
+
+/*
+ * PageGetPageSize
+ *		Returns the page size recorded in the page header, i.e.
+ *		pd_pagesize_version with the low (version) byte masked off.
+ *
+ * Only usable on a formatted page (unlike BufferGetPageSize, which can be
+ * called on an unformatted page); the page does not have to be stored in
+ * a buffer, however.
+ */
+#define PageGetPageSize(page) \
+	((Size) ((uint16) 0xFF00 & ((PageHeader) (page))->pd_pagesize_version))
+
+/*
+ * PageGetPageLayoutVersion
+ *		Returns the page layout version, i.e. the low byte of
+ *		pd_pagesize_version.
+ */
+#define PageGetPageLayoutVersion(page) \
+	(0x00FF & ((PageHeader) (page))->pd_pagesize_version)
+
+/*
+ * PageSetPageSizeAndVersion
+ *		Stores the page size and the page layout version number together
+ *		in pd_pagesize_version.  AssertMacro-checks that size has no bits
+ *		outside 0xFF00 and that version has no bits outside 0x00FF.
+ *
+ * The two values could be settable separately, but there's no real need
+ * for that at the moment.  Note that size and version are each evaluated
+ * more than once.
+ */
+#define PageSetPageSizeAndVersion(page, size, version) \
+( \
+	AssertMacro((size) == ((size) & 0xFF00)), \
+	AssertMacro((version) == ((version) & 0x00FF)), \
+	((PageHeader) (page))->pd_pagesize_version = (version) | (size) \
+)
+
+/* ----------------
+ *		page special data macros
+ * ----------------
+ */
+/*
+ * PageGetSpecialSize
+ *		Returns size of special space on a page: the page size recorded in
+ *		the header minus pd_special, truncated to uint16.  The page argument
+ *		is evaluated twice.
+ */
+#define PageGetSpecialSize(page) \
+	((uint16) (PageGetPageSize(page) - ((PageHeader)(page))->pd_special))
+
+/*
+ * PageGetSpecialPointer
+ *		Returns a char pointer to the start of the special space, found
+ *		pd_special bytes from the start of the page.  The page argument is
+ *		evaluated more than once.
+ */
+#define PageGetSpecialPointer(page) \
+( \
+	AssertMacro(PageIsValid(page)), \
+	&((char *) (page))[((PageHeader) (page))->pd_special] \
+)
+
+/*
+ * PageGetItem
+ *		Returns a pointer to an item's storage: the page address plus the
+ *		offset recorded in the given line pointer.  AssertMacro-checks that
+ *		the page pointer is valid and that the line pointer has storage.
+ *
+ * Note:
+ *		Nothing passed in is modified by this macro.
+ *		The semantics may change in the future.
+ */
+#define PageGetItem(page, itemId) \
+( \
+	AssertMacro(PageIsValid(page)), \
+	AssertMacro(ItemIdHasStorage(itemId)), \
+	(Item) (&((char *) (page))[ItemIdGetOffset(itemId)]) \
+)
+
+/*
+ * PageGetMaxOffsetNumber
+ *		Returns the highest offset number in use on the given page.
+ *		Offset numbers are 1-based, so this equals the number of line
+ *		pointers on the page.
+ *
+ *		NOTE: for a page that is not initialized (pd_lower == 0) the
+ *		result must be zero to ensure sane behavior.  The argument is
+ *		evaluated twice so that this can be guaranteed.
+ */
+#define PageGetMaxOffsetNumber(page) \
+	(((PageHeader) (page))->pd_lower > SizeOfPageHeaderData ? \
+	 ((((PageHeader) (page))->pd_lower - SizeOfPageHeaderData) \
+	  / sizeof(ItemIdData)) : 0)
+
+/*
+ * Additional macros for access to page headers.  Each macro below evaluates
+ * its "page" argument exactly once.
+ */
+/* Fetch / store pd_lsn. */
+#define PageGetLSN(page) \
+	(((PageHeader) (page))->pd_lsn)
+#define PageSetLSN(page, lsn) \
+	(((PageHeader) (page))->pd_lsn = (lsn))
+
+/* NOTE: only the 16 least significant bits are stored */
+#define PageGetTLI(page) \
+	(((PageHeader) (page))->pd_tli)
+#define PageSetTLI(page, tli) \
+	(((PageHeader) (page))->pd_tli = (uint16) (tli))
+
+/*
+ * PD_HAS_FREE_LINES: test, set, clear.  The test macro yields the masked
+ * flag bit (zero or nonzero), not a normalized 0/1 value; the same is true
+ * of PageIsFull and PageIsAllVisible.
+ */
+#define PageHasFreeLinePointers(page) \
+	(((PageHeader) (page))->pd_flags & PD_HAS_FREE_LINES)
+#define PageSetHasFreeLinePointers(page) \
+	(((PageHeader) (page))->pd_flags |= PD_HAS_FREE_LINES)
+#define PageClearHasFreeLinePointers(page) \
+	(((PageHeader) (page))->pd_flags &= ~PD_HAS_FREE_LINES)
+
+/* PD_PAGE_FULL: test, set, clear. */
+#define PageIsFull(page) \
+	(((PageHeader) (page))->pd_flags & PD_PAGE_FULL)
+#define PageSetFull(page) \
+	(((PageHeader) (page))->pd_flags |= PD_PAGE_FULL)
+#define PageClearFull(page) \
+	(((PageHeader) (page))->pd_flags &= ~PD_PAGE_FULL)
+
+/* PD_ALL_VISIBLE: test, set, clear. */
+#define PageIsAllVisible(page) \
+	(((PageHeader) (page))->pd_flags & PD_ALL_VISIBLE)
+#define PageSetAllVisible(page) \
+	(((PageHeader) (page))->pd_flags |= PD_ALL_VISIBLE)
+#define PageClearAllVisible(page) \
+	(((PageHeader) (page))->pd_flags &= ~PD_ALL_VISIBLE)
+
+/*
+ * PageIsPrunable
+ *		True iff pd_prune_xid is valid and precedes oldestxmin.
+ *		oldestxmin must be a normal XID (AssertMacro-checked).
+ */
+#define PageIsPrunable(page, oldestxmin) \
+( \
+	AssertMacro(TransactionIdIsNormal(oldestxmin)), \
+	TransactionIdIsValid(((PageHeader) (page))->pd_prune_xid) && \
+	TransactionIdPrecedes(((PageHeader) (page))->pd_prune_xid, oldestxmin) \
+)
+/*
+ * PageSetPrunable
+ *		Stores xid in pd_prune_xid if that field is currently invalid or
+ *		if xid precedes the value already stored; otherwise leaves the
+ *		field alone.  xid must be a normal XID (Assert-checked).
+ *		This is a statement macro; page and xid are evaluated repeatedly.
+ */
+#define PageSetPrunable(page, xid) \
+do { \
+	Assert(TransactionIdIsNormal(xid)); \
+	if (!TransactionIdIsValid(((PageHeader) (page))->pd_prune_xid) || \
+		TransactionIdPrecedes(xid, ((PageHeader) (page))->pd_prune_xid)) \
+		((PageHeader) (page))->pd_prune_xid = (xid); \
+} while (0)
+/*
+ * PageClearPrunable
+ *		Resets pd_prune_xid to InvalidTransactionId.
+ */
+#define PageClearPrunable(page) \
+	(((PageHeader) (page))->pd_prune_xid = InvalidTransactionId)
+
+
+/* ----------------------------------------------------------------
+ *		extern declarations
+ * ----------------------------------------------------------------
+ */
+
+extern void PageInit(Page page, Size pageSize, Size specialSize);
+extern bool PageHeaderIsValid(PageHeader page);
+extern OffsetNumber PageAddItem(Page page, Item item, Size size,
+			OffsetNumber offsetNumber, bool overwrite, bool is_heap);
+extern Page PageGetTempPage(Page page);
+extern Page PageGetTempPageCopy(Page page);
+extern Page PageGetTempPageCopySpecial(Page page);
+extern void PageRestoreTempPage(Page tempPage, Page oldPage);
+extern void PageRepairFragmentation(Page page);
+extern Size PageGetFreeSpace(Page page);
+extern Size PageGetExactFreeSpace(Page page);
+extern Size PageGetHeapFreeSpace(Page page);
+extern void PageIndexTupleDelete(Page page, OffsetNumber offset);
+extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems);
+
+#endif   /* BUFPAGE_H */
--- a/pg_include/storage/copydir.h
+++ b/pg_include/storage/copydir.h
@@ -0,0 +1,19 @@
+/*-------------------------------------------------------------------------
+ *
+ * copydir.h
+ *	  Copy a directory.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/copydir.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef COPYDIR_H
+#define COPYDIR_H
+
+extern void copydir(char *fromdir, char *todir, bool recurse);
+extern void copy_file(char *fromfile, char *tofile);
+
+#endif   /* COPYDIR_H */
--- a/pg_include/storage/fd.h
+++ b/pg_include/storage/fd.h
@@ -0,0 +1,112 @@
+/*-------------------------------------------------------------------------
+ *
+ * fd.h
+ *	  Virtual file descriptor definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/fd.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * calls:
+ *
+ *	File {Close, Read, Write, Seek, Tell, Sync}
+ *	{File Name Open, Allocate, Free} File
+ *
+ * These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
+ * Use them for all file activity...
+ *
+ *	File fd;
+ *	fd = PathNameOpenFile("foo", O_RDONLY, 0600);
+ *
+ *	AllocateFile();
+ *	FreeFile();
+ *
+ * Use AllocateFile, not fopen, if you need a stdio file (FILE*); then
+ * use FreeFile, not fclose, to close it.  AVOID using stdio for files
+ * that you intend to hold open for any length of time, since there is
+ * no way for them to share kernel file descriptors with other files.
+ *
+ * Likewise, use AllocateDir/FreeDir, not opendir/closedir, to allocate
+ * open directories (DIR*).
+ */
+#ifndef FD_H
+#define FD_H
+
+#include <dirent.h>
+
+
+/*
+ * FileSeek uses the standard UNIX lseek(2) flags.
+ */
+
+typedef char *FileName;
+
+typedef int File;
+
+
+/* GUC parameter */
+extern int	max_files_per_process;
+
+/*
+ * This is private to fd.c, but exported for save/restore_backend_variables()
+ */
+extern int	max_safe_fds;
+
+
+/*
+ * prototypes for functions in fd.c
+ */
+
+/* Operations on virtual Files --- equivalent to Unix kernel file ops */
+extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode);
+extern File OpenTemporaryFile(bool interXact);
+extern void FileClose(File file);
+extern int	FilePrefetch(File file, off_t offset, int amount);
+extern int	FileRead(File file, char *buffer, int amount);
+extern int	FileWrite(File file, char *buffer, int amount);
+extern int	FileSync(File file);
+extern off_t FileSeek(File file, off_t offset, int whence);
+extern int	FileTruncate(File file, off_t offset);
+extern char *FilePathName(File file);
+
+/* Operations that allow use of regular stdio --- USE WITH CAUTION */
+extern FILE *AllocateFile(const char *name, const char *mode);
+extern int	FreeFile(FILE *file);
+
+/* Operations to allow use of the <dirent.h> library routines */
+extern DIR *AllocateDir(const char *dirname);
+extern struct dirent *ReadDir(DIR *dir, const char *dirname);
+extern int	FreeDir(DIR *dir);
+
+/* If you've really really gotta have a plain kernel FD, use this */
+extern int	BasicOpenFile(FileName fileName, int fileFlags, int fileMode);
+
+/* Miscellaneous support routines */
+extern void InitFileAccess(void);
+extern void set_max_safe_fds(void);
+extern void closeAllVfds(void);
+extern void SetTempTablespaces(Oid *tableSpaces, int numSpaces);
+extern bool TempTablespacesAreSet(void);
+extern Oid	GetNextTempTableSpace(void);
+extern void AtEOXact_Files(void);
+extern void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
+				  SubTransactionId parentSubid);
+extern void RemovePgTempFiles(void);
+
+extern int	pg_fsync(int fd);
+extern int	pg_fsync_no_writethrough(int fd);
+extern int	pg_fsync_writethrough(int fd);
+extern int	pg_fdatasync(int fd);
+extern int	pg_flush_data(int fd, off_t offset, off_t amount);
+
+/* Filename components for OpenTemporaryFile */
+#define PG_TEMP_FILES_DIR "pgsql_tmp"
+#define PG_TEMP_FILE_PREFIX "pgsql_tmp"
+
+#endif   /* FD_H */
--- a/pg_include/storage/freespace.h
+++ b/pg_include/storage/freespace.h
@@ -0,0 +1,36 @@
+/*-------------------------------------------------------------------------
+ *
+ * freespace.h
+ *	  POSTGRES free space map for quickly finding free space in relations
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/freespace.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef FREESPACE_H_
+#define FREESPACE_H_
+
+#include "storage/block.h"
+#include "storage/relfilenode.h"
+#include "utils/relcache.h"
+
+/* prototypes for public functions in freespace.c */
+extern Size GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk);
+extern BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded);
+extern BlockNumber RecordAndGetPageWithFreeSpace(Relation rel,
+							  BlockNumber oldPage,
+							  Size oldSpaceAvail,
+							  Size spaceNeeded);
+extern void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk,
+						Size spaceAvail);
+extern void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
+							Size spaceAvail);
+
+extern void FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks);
+extern void FreeSpaceMapVacuum(Relation rel);
+
+#endif   /* FREESPACE_H_ */
--- a/pg_include/storage/fsm_internals.h
+++ b/pg_include/storage/fsm_internals.h
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * fsm_internals.h
+ *	  internal functions for free space map
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/fsm_internals.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef FSM_INTERNALS_H
+#define FSM_INTERNALS_H
+
+#include "storage/buf.h"
+#include "storage/bufpage.h"
+
+/*
+ * Structure of a FSM page. See src/backend/storage/freespace/README for
+ * details.
+ */
+typedef struct
+{
+	/*
+	 * fsm_search_avail() tries to spread the load of multiple backends by
+	 * returning different pages to different backends in a round-robin
+	 * fashion. fp_next_slot points to the next slot to be returned (assuming
+	 * there's enough space on it for the request). It's defined as an int,
+	 * because it's updated without an exclusive lock. uint16 would be more
+	 * appropriate, but int is more likely to be atomically
+	 * fetchable/storable.
+	 */
+	int			fp_next_slot;
+
+	/*
+	 * fp_nodes contains the binary tree, stored in array. The first
+	 * NonLeafNodesPerPage elements are upper nodes, and the following
+	 * LeafNodesPerPage elements are leaf nodes. Unused nodes are zero.
+	 */
+	uint8		fp_nodes[1];
+} FSMPageData;
+
+typedef FSMPageData *FSMPage;
+
+/*
+ * Number of non-leaf and leaf nodes, and nodes in total, on an FSM page.
+ * These definitions are internal to fsmpage.c.
+ */
+#define NodesPerPage (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \
+					  offsetof(FSMPageData, fp_nodes))
+
+#define NonLeafNodesPerPage (BLCKSZ / 2 - 1)
+#define LeafNodesPerPage (NodesPerPage - NonLeafNodesPerPage)
+
+/*
+ * Number of FSM "slots" on a FSM page. This is what should be used
+ * outside fsmpage.c.
+ */
+#define SlotsPerFSMPage LeafNodesPerPage
+
+/* Prototypes for functions in fsmpage.c */
+extern int fsm_search_avail(Buffer buf, uint8 min_cat, bool advancenext,
+				 bool exclusive_lock_held);
+extern uint8 fsm_get_avail(Page page, int slot);
+extern uint8 fsm_get_max_avail(Page page);
+extern bool fsm_set_avail(Page page, int slot, uint8 value);
+extern bool fsm_truncate_avail(Page page, int nslots);
+extern bool fsm_rebuild_page(Page page);
+
+#endif   /* FSM_INTERNALS_H */
--- a/pg_include/storage/indexfsm.h
+++ b/pg_include/storage/indexfsm.h
@@ -0,0 +1,26 @@
+/*-------------------------------------------------------------------------
+ *
+ * indexfsm.h
+ *	  POSTGRES free space map for quickly finding an unused page in index
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/indexfsm.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef INDEXFSM_H_
+#define INDEXFSM_H_
+
+#include "storage/block.h"
+#include "utils/relcache.h"
+
+extern BlockNumber GetFreeIndexPage(Relation rel);
+extern void RecordFreeIndexPage(Relation rel, BlockNumber page);
+extern void RecordUsedIndexPage(Relation rel, BlockNumber page);
+
+extern void IndexFreeSpaceMapVacuum(Relation rel);
+
+#endif   /* INDEXFSM_H_ */
--- a/pg_include/storage/ipc.h
+++ b/pg_include/storage/ipc.h
@@ -0,0 +1,79 @@
+/*-------------------------------------------------------------------------
+ *
+ * ipc.h
+ *	  POSTGRES inter-process communication definitions.
+ *
+ * This file is misnamed, as it no longer has much of anything directly
+ * to do with IPC.  The functionality here is concerned with managing
+ * exit-time cleanup for either a postmaster or a backend.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/ipc.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef IPC_H
+#define IPC_H
+
+typedef void (*pg_on_exit_callback) (int code, Datum arg);
+typedef void (*shmem_startup_hook_type) (void);
+
+/*----------
+ * API for handling cleanup that must occur during either ereport(ERROR)
+ * or ereport(FATAL) exits from a block of code.  (Typical examples are
+ * undoing transient changes to shared-memory state.)
+ *
+ *		PG_ENSURE_ERROR_CLEANUP(cleanup_function, arg);
+ *		{
+ *			... code that might throw ereport(ERROR) or ereport(FATAL) ...
+ *		}
+ *		PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg);
+ *
+ * where the cleanup code is in a function declared per pg_on_exit_callback.
+ * The Datum value "arg" can carry any information the cleanup function
+ * needs.
+ *
+ * This construct ensures that cleanup_function() will be called during
+ * either ERROR or FATAL exits.  It will not be called on successful
+ * exit from the controlled code.  (If you want it to happen then too,
+ * call the function yourself from just after the construct.)
+ *
+ * Note: the macro arguments are multiply evaluated, so avoid side-effects.
+ *----------
+ */
+/*
+ * Registers cleanup_function with on_shmem_exit() and opens a
+ * "do { ... PG_TRY()" construct.  The construct is syntactically incomplete
+ * until it is closed by PG_END_ENSURE_ERROR_CLEANUP.
+ */
+#define PG_ENSURE_ERROR_CLEANUP(cleanup_function, arg)	\
+	do { \
+		on_shmem_exit(cleanup_function, arg); \
+		PG_TRY()
+
+/*
+ * Closes the construct opened by PG_ENSURE_ERROR_CLEANUP.  On normal
+ * completion the callback is only unregistered (cancel_shmem_exit).  In the
+ * PG_CATCH branch it is unregistered, then invoked with code 0 and arg, and
+ * the error is re-thrown.
+ */
+#define PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg)	\
+		cancel_shmem_exit(cleanup_function, arg); \
+		PG_CATCH(); \
+		{ \
+			cancel_shmem_exit(cleanup_function, arg); \
+			cleanup_function (0, arg); \
+			PG_RE_THROW(); \
+		} \
+		PG_END_TRY(); \
+	} while (0)
+
+
+/* ipc.c */
+extern bool proc_exit_inprogress;
+
+extern void proc_exit(int code);
+extern void shmem_exit(int code);
+extern void on_proc_exit(pg_on_exit_callback function, Datum arg);
+extern void on_shmem_exit(pg_on_exit_callback function, Datum arg);
+extern void cancel_shmem_exit(pg_on_exit_callback function, Datum arg);
+extern void on_exit_reset(void);
+
+/* ipci.c */
+extern PGDLLIMPORT shmem_startup_hook_type shmem_startup_hook;
+
+extern void CreateSharedMemoryAndSemaphores(bool makePrivate, int port);
+
+#endif   /* IPC_H */
--- a/pg_include/storage/item.h
+++ b/pg_include/storage/item.h
@@ -0,0 +1,19 @@
+/*-------------------------------------------------------------------------
+ *
+ * item.h
+ *	  POSTGRES disk item definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/item.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ITEM_H
+#define ITEM_H
+
+typedef Pointer Item;
+
+#endif   /* ITEM_H */
--- a/pg_include/storage/itemid.h
+++ b/pg_include/storage/itemid.h
@@ -0,0 +1,183 @@
+/*-------------------------------------------------------------------------
+ *
+ * itemid.h
+ *	  Standard POSTGRES buffer page item identifier definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/itemid.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ITEMID_H
+#define ITEMID_H
+
+/*
+ * An item pointer (also called line pointer) on a buffer page
+ *
+ * In some cases an item pointer is "in use" but does not have any associated
+ * storage on the page.  By convention, lp_len == 0 in every item pointer
+ * that does not have storage, independently of its lp_flags state.
+ */
+typedef struct ItemIdData
+{
+	unsigned	lp_off:15,		/* offset to tuple (from start of page) */
+				lp_flags:2,		/* state of item pointer, see below */
+				lp_len:15;		/* byte length of tuple */
+} ItemIdData;
+
+typedef ItemIdData *ItemId;
+
+/*
+ * lp_flags has these possible states.  An UNUSED line pointer is available
+ * for immediate re-use, the other states are not.
+ */
+#define LP_UNUSED		0		/* unused (should always have lp_len=0) */
+#define LP_NORMAL		1		/* used (should always have lp_len>0) */
+#define LP_REDIRECT		2		/* HOT redirect (should have lp_len=0) */
+#define LP_DEAD			3		/* dead, may or may not have storage */
+
+/*
+ * Item offsets and lengths are represented by these types when
+ * they're not actually stored in an ItemIdData.
+ */
+typedef uint16 ItemOffset;
+typedef uint16 ItemLength;
+
+
+/* ----------------
+ *		support macros
+ * ----------------
+ */
+
+/*
+ * ItemIdGetLength
+ *		Returns lp_len, the byte length of the tuple.
+ */
+#define ItemIdGetLength(itemId) \
+   ((itemId)->lp_len)
+
+/*
+ * ItemIdGetOffset
+ *		Returns lp_off, the offset to the tuple from the start of the page.
+ */
+#define ItemIdGetOffset(itemId) \
+   ((itemId)->lp_off)
+
+/*
+ * ItemIdGetFlags
+ *		Returns lp_flags, the state of the item pointer (one of the LP_*
+ *		values).
+ */
+#define ItemIdGetFlags(itemId) \
+   ((itemId)->lp_flags)
+
+/*
+ * ItemIdGetRedirect
+ *		Returns lp_off.  In a REDIRECT pointer, lp_off holds the link to the
+ *		next item pointer rather than a byte offset.
+ */
+#define ItemIdGetRedirect(itemId) \
+   ((itemId)->lp_off)
+
+/*
+ * ItemIdIsValid
+ *		True iff item identifier is valid.
+ *		This is a pretty weak test, probably useful only in Asserts.
+ */
+#define ItemIdIsValid(itemId)	PointerIsValid(itemId)
+
+/*
+ * ItemIdIsUsed
+ *		True iff item identifier is in use.
+ */
+#define ItemIdIsUsed(itemId) \
+	((itemId)->lp_flags != LP_UNUSED)
+
+/*
+ * ItemIdIsNormal
+ *		True iff item identifier is in state NORMAL.
+ */
+#define ItemIdIsNormal(itemId) \
+	((itemId)->lp_flags == LP_NORMAL)
+
+/*
+ * ItemIdIsRedirected
+ *		True iff item identifier is in state REDIRECT.
+ */
+#define ItemIdIsRedirected(itemId) \
+	((itemId)->lp_flags == LP_REDIRECT)
+
+/*
+ * ItemIdIsDead
+ *		True iff item identifier is in state DEAD.
+ */
+#define ItemIdIsDead(itemId) \
+	((itemId)->lp_flags == LP_DEAD)
+
+/*
+ * ItemIdHasStorage
+ *		True iff item identifier has associated storage.
+ */
+#define ItemIdHasStorage(itemId) \
+	((itemId)->lp_len != 0)
+
+/*
+ * ItemIdSetUnused
+ *		Set the item identifier to be UNUSED, with no storage.
+ *		Beware of multiple evaluations of itemId!
+ */
+#define ItemIdSetUnused(itemId) \
+( \
+	(itemId)->lp_flags = LP_UNUSED, \
+	(itemId)->lp_off = 0, \
+	(itemId)->lp_len = 0 \
+)
+
+/*
+ * ItemIdSetNormal
+ *		Set the item identifier to be NORMAL, with the specified storage.
+ *		Beware of multiple evaluations of itemId!
+ */
+#define ItemIdSetNormal(itemId, off, len) \
+( \
+	(itemId)->lp_flags = LP_NORMAL, \
+	(itemId)->lp_off = (off), \
+	(itemId)->lp_len = (len) \
+)
+
+/*
+ * ItemIdSetRedirect
+ *		Set the item identifier to be REDIRECT, with the specified link.
+ *		Beware of multiple evaluations of itemId!
+ */
+#define ItemIdSetRedirect(itemId, link) \
+( \
+	(itemId)->lp_flags = LP_REDIRECT, \
+	(itemId)->lp_off = (link), \
+	(itemId)->lp_len = 0 \
+)
+
+/*
+ * ItemIdSetDead
+ *		Set the item identifier to be DEAD, with no storage.
+ *		Beware of multiple evaluations of itemId!
+ */
+#define ItemIdSetDead(itemId) \
+( \
+	(itemId)->lp_flags = LP_DEAD, \
+	(itemId)->lp_off = 0, \
+	(itemId)->lp_len = 0 \
+)
+
+/*
+ * ItemIdMarkDead
+ *		Set the item identifier to be DEAD, keeping its existing storage.
+ *
+ * Note: in indexes, this is used as if it were a hint-bit mechanism;
+ * we trust that multiple processors can do this in parallel and get
+ * the same result.
+ */
+#define ItemIdMarkDead(itemId) \
+( \
+	(itemId)->lp_flags = LP_DEAD \
+)
+
+#endif   /* ITEMID_H */
--- a/pg_include/storage/itemptr.h
+++ b/pg_include/storage/itemptr.h
@@ -0,0 +1,146 @@
+/*-------------------------------------------------------------------------
+ *
+ * itemptr.h
+ *	  POSTGRES disk item pointer definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/itemptr.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ITEMPTR_H
+#define ITEMPTR_H
+
+#include "storage/block.h"
+#include "storage/off.h"
+
+/*
+ * ItemPointer:
+ *
+ * This is a pointer to an item within a disk page of a known file
+ * (for example, a cross-link from an index to its parent table).
+ * blkid tells us which block, posid tells us which entry in the linp
+ * (ItemIdData) array we want.
+ *
+ * Note: because there is an item pointer in each tuple header and index
+ * tuple header on disk, it's very important not to waste space with
+ * structure padding bytes.  The struct is designed to be six bytes long
+ * (it contains three int16 fields) but a few compilers will pad it to
+ * eight bytes unless coerced.  We apply appropriate persuasion where
+ * possible, and to cope with unpersuadable compilers, we try to use
+ * "SizeOfIptrData" rather than "sizeof(ItemPointerData)" when computing
+ * on-disk sizes.
+ */
+typedef struct ItemPointerData
+{
+	BlockIdData ip_blkid;		/* which block of the file */
+	OffsetNumber ip_posid;		/* which linp (ItemIdData) entry in that block */
+}
+
+#ifdef __arm__
+__attribute__((packed))			/* Appropriate whack upside the head for ARM */
+#endif
+ItemPointerData;
+
+#define SizeOfIptrData	\
+	(offsetof(ItemPointerData, ip_posid) + sizeof(OffsetNumber))
+
+typedef ItemPointerData *ItemPointer;	/* pointer to an ItemPointerData */
+
+/* ----------------
+ *		support macros
+ * ----------------
+ */
+
+/*
+ * ItemPointerIsValid
+ *		True iff the pointer is not NULL and its ip_posid is nonzero.
+ */
+#define ItemPointerIsValid(pointer) \
+	((bool) (PointerIsValid(pointer) && ((pointer)->ip_posid != 0)))
+
+/*
+ * ItemPointerGetBlockNumber
+ *		Returns the block number of a disk item pointer (asserted to be valid).
+ */
+#define ItemPointerGetBlockNumber(pointer) \
+( \
+	AssertMacro(ItemPointerIsValid(pointer)), \
+	BlockIdGetBlockNumber(&(pointer)->ip_blkid) \
+)
+
+/*
+ * ItemPointerGetOffsetNumber
+ *		Returns the offset number of a disk item pointer (asserted to be valid).
+ */
+#define ItemPointerGetOffsetNumber(pointer) \
+( \
+	AssertMacro(ItemPointerIsValid(pointer)), \
+	(pointer)->ip_posid \
+)
+
+/*
+ * ItemPointerSet
+ *		Sets both the block and the offset of a disk item pointer (multiple eval!).
+ */
+#define ItemPointerSet(pointer, blockNumber, offNum) \
+( \
+	AssertMacro(PointerIsValid(pointer)), \
+	BlockIdSet(&((pointer)->ip_blkid), blockNumber), \
+	(pointer)->ip_posid = (offNum) \
+)
+
+/*
+ * ItemPointerSetBlockNumber
+ *		Sets only the block part of a disk item pointer; ip_posid is untouched.
+ */
+#define ItemPointerSetBlockNumber(pointer, blockNumber) \
+( \
+	AssertMacro(PointerIsValid(pointer)), \
+	BlockIdSet(&((pointer)->ip_blkid), blockNumber) \
+)
+
+/*
+ * ItemPointerSetOffsetNumber
+ *		Sets only the offset part of a disk item pointer; ip_blkid is untouched.
+ */
+#define ItemPointerSetOffsetNumber(pointer, offsetNumber) \
+( \
+	AssertMacro(PointerIsValid(pointer)), \
+	(pointer)->ip_posid = (offsetNumber) \
+)
+
+/*
+ * ItemPointerCopy
+ *		Copies one disk item pointer to another by whole-struct assignment.
+ */
+#define ItemPointerCopy(fromPointer, toPointer) \
+( \
+	AssertMacro(PointerIsValid(toPointer)), \
+	AssertMacro(PointerIsValid(fromPointer)), \
+	*(toPointer) = *(fromPointer) \
+)
+
+/*
+ * ItemPointerSetInvalid
+ *		Sets a disk item pointer to InvalidBlockNumber / InvalidOffsetNumber.
+ */
+#define ItemPointerSetInvalid(pointer) \
+( \
+	AssertMacro(PointerIsValid(pointer)), \
+	BlockIdSet(&((pointer)->ip_blkid), InvalidBlockNumber), \
+	(pointer)->ip_posid = InvalidOffsetNumber \
+)
+
+/* ----------------
+ *		externs
+ * ----------------
+ */
+
+extern bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2);
+extern int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2);
+
+#endif   /* ITEMPTR_H */
--- a/pg_include/storage/large_object.h
+++ b/pg_include/storage/large_object.h
@@ -0,0 +1,83 @@
+/*-------------------------------------------------------------------------
+ *
+ * large_object.h
+ *	  Declarations for PostgreSQL large objects.  POSTGRES 4.2 supported
+ *	  zillions of large objects (internal, external, jaquith, inversion).
+ *	  Now we only support inversion.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/large_object.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LARGE_OBJECT_H
+#define LARGE_OBJECT_H
+
+#include "utils/snapshot.h"
+
+
+/*----------
+ * Data about a currently-open large object.
+ *
+ * id is the logical OID of the large object
+ * snapshot is the snapshot to use for read/write operations
+ * subid is the subtransaction that opened the desc (or currently owns it)
+ * offset is the current seek offset within the LO
+ * flags contains some flag bits (IFS_RDLOCK, IFS_WRLOCK; see below)
+ *
+ * NOTE: before 7.1, we also had to store references to the separate table
+ * and index of a specific large object.  Now they all live in pg_largeobject
+ * and are accessed via a common relation descriptor.
+ *----------
+ */
+typedef struct LargeObjectDesc
+{
+	Oid			id;				/* LO's identifier */
+	Snapshot	snapshot;		/* snapshot to use */
+	SubTransactionId subid;		/* owning subtransaction ID */
+	uint32		offset;			/* current seek pointer */
+	int			flags;			/* locking info, etc */
+
+/* flag bits for "flags"; being #defines, they are not scoped to the struct: */
+#define IFS_RDLOCK		(1 << 0)
+#define IFS_WRLOCK		(1 << 1)
+
+} LargeObjectDesc;
+
+
+/*
+ * Each "page" (tuple) of a large object can hold this much data
+ *
+ * We could set this as high as BLCKSZ less some overhead, but it seems
+ * better to make it a smaller value, so that not as much space is used
+ * up when a page-tuple is updated.  Note that the value is deliberately
+ * chosen large enough to trigger the tuple toaster, so that we will
+ * attempt to compress page tuples in-line.  (But they won't be moved off
+ * unless the user creates a toast-table for pg_largeobject...)
+ *
+ * Also, it seems to be a smart move to make the page size be a power of 2,
+ * since clients will often be written to send data in power-of-2 blocks.
+ * This avoids unnecessary tuple updates caused by partial-page writes.
+ */
+#define LOBLKSIZE		(BLCKSZ / 4)
+
+
+/*
+ * Function definitions...
+ */
+
+/* inversion stuff in inv_api.c */
+extern void close_lo_relation(bool isCommit);
+extern Oid	inv_create(Oid lobjId);
+extern LargeObjectDesc *inv_open(Oid lobjId, int flags, MemoryContext mcxt);
+extern void inv_close(LargeObjectDesc *obj_desc);
+extern int	inv_drop(Oid lobjId);
+extern int	inv_seek(LargeObjectDesc *obj_desc, int offset, int whence);
+extern int	inv_tell(LargeObjectDesc *obj_desc);
+extern int	inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes);
+extern int	inv_write(LargeObjectDesc *obj_desc, const char *buf, int nbytes);
+extern void inv_truncate(LargeObjectDesc *obj_desc, int len);
+
+#endif   /* LARGE_OBJECT_H */
--- a/pg_include/storage/latch.h
+++ b/pg_include/storage/latch.h
@@ -0,0 +1,139 @@
+/*-------------------------------------------------------------------------
+ *
+ * latch.h
+ *	  Routines for interprocess latches
+ *
+ * A latch is a boolean variable, with operations that let processes sleep
+ * until it is set. A latch can be set from another process, or a signal
+ * handler within the same process.
+ *
+ * The latch interface is a reliable replacement for the common pattern of
+ * using pg_usleep() or select() to wait until a signal arrives, where the
+ * signal handler sets a flag variable. Because on some platforms an
+ * incoming signal doesn't interrupt sleep, and even on platforms where it
+ * does there is a race condition if the signal arrives just before
+ * entering the sleep, the common pattern must periodically wake up and
+ * poll the flag variable. The pselect() system call was invented to solve
+ * this problem, but it is not portable enough. Latches are designed to
+ * overcome these limitations, allowing you to sleep without polling and
+ * ensuring quick response to signals from other processes.
+ *
+ * There are two kinds of latches: local and shared. A local latch is
+ * initialized by InitLatch, and can only be set from the same process.
+ * A local latch can be used to wait for a signal to arrive, by calling
+ * SetLatch in the signal handler. A shared latch resides in shared memory,
+ * and must be initialized at postmaster startup by InitSharedLatch. Before
+ * a shared latch can be waited on, it must be associated with a process
+ * with OwnLatch. Only the process owning the latch can wait on it, but any
+ * process can set it.
+ *
+ * There are three basic operations on a latch:
+ *
+ * SetLatch		- Sets the latch
+ * ResetLatch	- Clears the latch, allowing it to be set again
+ * WaitLatch	- Waits for the latch to become set
+ *
+ * WaitLatch includes a provision for timeouts (which should be avoided
+ * when possible, as they incur extra overhead) and a provision for
+ * postmaster child processes to wake up immediately on postmaster death.
+ * See unix_latch.c for detailed specifications for the exported functions.
+ *
+ * The correct pattern to wait for event(s) is:
+ *
+ * for (;;)
+ * {
+ *	   ResetLatch();
+ *	   if (work to do)
+ *		   Do Stuff();
+ *	   WaitLatch();
+ * }
+ *
+ * It's important to reset the latch *before* checking if there's work to
+ * do. Otherwise, if someone sets the latch between the check and the
+ * ResetLatch call, you will miss it and Wait will incorrectly block.
+ *
+ * To wake up the waiter, you must first set a global flag or something
+ * else that the wait loop tests in the "if (work to do)" part, and call
+ * SetLatch *after* that. SetLatch is designed to return quickly if the
+ * latch is already set.
+ *
+ * Presently, when using a shared latch for interprocess signalling, the
+ * flag variable(s) set by senders and inspected by the wait loop must
+ * be protected by spinlocks or LWLocks, else it is possible to miss events
+ * on machines with weak memory ordering (such as PPC).  This restriction
+ * will be lifted in future by inserting suitable memory barriers into
+ * SetLatch and ResetLatch.
+ *
+ * On some platforms, signals will not interrupt the latch wait primitive
+ * by themselves.  Therefore, it is critical that any signal handler that
+ * is meant to terminate a WaitLatch wait calls SetLatch.
+ *
+ * Note that use of the process latch (PGPROC.procLatch) is generally better
+ * than an ad-hoc shared latch for signaling auxiliary processes.  This is
+ * because generic signal handlers will call SetLatch on the process latch
+ * only, so using any latch other than the process latch effectively precludes
+ * use of any generic handler.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/latch.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LATCH_H
+#define LATCH_H
+
+#include <signal.h>
+
+/*
+ * Latch structure should be treated as opaque and only accessed through
+ * the public functions. It is defined here to allow embedding Latches as
+ * part of bigger structs.
+ */
+typedef struct
+{
+	sig_atomic_t is_set;		/* the latch's boolean state; read by TestLatch */
+	bool		is_shared;		/* presumably shared vs. local latch; confirm */
+	int			owner_pid;		/* presumably PID set by OwnLatch; confirm */
+#ifdef WIN32
+	HANDLE		event;			/* only present in WIN32 builds */
+#endif
+} Latch;
+
+/* Bitmasks for events that may wake up WaitLatch() clients */
+#define WL_LATCH_SET		 (1 << 0)
+#define WL_SOCKET_READABLE	 (1 << 1)
+#define WL_SOCKET_WRITEABLE  (1 << 2)
+#define WL_TIMEOUT			 (1 << 3)
+#define WL_POSTMASTER_DEATH  (1 << 4)
+
+/*
+ * prototypes for functions in latch.c
+ */
+extern void InitializeLatchSupport(void);
+extern void InitLatch(volatile Latch *latch);
+extern void InitSharedLatch(volatile Latch *latch);
+extern void OwnLatch(volatile Latch *latch);
+extern void DisownLatch(volatile Latch *latch);
+extern int	WaitLatch(volatile Latch *latch, int wakeEvents, long timeout);
+extern int WaitLatchOrSocket(volatile Latch *latch, int wakeEvents,
+				  pgsocket sock, long timeout);
+extern void SetLatch(volatile Latch *latch);
+extern void ResetLatch(volatile Latch *latch);
+
+/* beware of memory ordering issues if you use this macro! */
+#define TestLatch(latch) (((volatile Latch *) (latch))->is_set)
+
+/*
+ * Unix implementation uses SIGUSR1 for inter-process signaling.
+ * Win32 doesn't need this.
+ */
+#ifndef WIN32
+extern void latch_sigusr1_handler(void);
+#else
+#define latch_sigusr1_handler()  ((void) 0)
+#endif
+
+#endif   /* LATCH_H */
--- a/pg_include/storage/lmgr.h
+++ b/pg_include/storage/lmgr.h
@@ -0,0 +1,80 @@
+/*-------------------------------------------------------------------------
+ *
+ * lmgr.h
+ *	  POSTGRES lock manager definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/lmgr.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LMGR_H
+#define LMGR_H
+
+#include "lib/stringinfo.h"
+#include "storage/itemptr.h"
+#include "storage/lock.h"
+#include "utils/rel.h"
+
+
+extern void RelationInitLockInfo(Relation relation);
+
+/* Lock a relation */
+extern void LockRelationOid(Oid relid, LOCKMODE lockmode);
+extern bool ConditionalLockRelationOid(Oid relid, LOCKMODE lockmode);
+extern void UnlockRelationId(LockRelId *relid, LOCKMODE lockmode);
+extern void UnlockRelationOid(Oid relid, LOCKMODE lockmode);
+
+extern void LockRelation(Relation relation, LOCKMODE lockmode);
+extern bool ConditionalLockRelation(Relation relation, LOCKMODE lockmode);
+extern void UnlockRelation(Relation relation, LOCKMODE lockmode);
+extern bool LockHasWaitersRelation(Relation relation, LOCKMODE lockmode);
+
+extern void LockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode);
+extern void UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode);
+
+/* Lock a relation for extension */
+extern void LockRelationForExtension(Relation relation, LOCKMODE lockmode);
+extern void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode);
+
+/* Lock a page (currently only used within indexes) */
+extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
+extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
+extern void UnlockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
+
+/* Lock a tuple (see heap_lock_tuple before assuming you understand this) */
+extern void LockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode);
+extern bool ConditionalLockTuple(Relation relation, ItemPointer tid,
+					 LOCKMODE lockmode);
+extern void UnlockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode);
+
+/* Lock an XID (used to wait for a transaction to finish) */
+extern void XactLockTableInsert(TransactionId xid);
+extern void XactLockTableDelete(TransactionId xid);
+extern void XactLockTableWait(TransactionId xid);
+extern bool ConditionalXactLockTableWait(TransactionId xid);
+
+/* Lock a general object (other than a relation) of the current database */
+extern void LockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
+				   LOCKMODE lockmode);
+extern void UnlockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
+					 LOCKMODE lockmode);
+
+/* Lock a shared-across-databases object (other than a relation) */
+extern void LockSharedObject(Oid classid, Oid objid, uint16 objsubid,
+				 LOCKMODE lockmode);
+extern void UnlockSharedObject(Oid classid, Oid objid, uint16 objsubid,
+				   LOCKMODE lockmode);
+
+extern void LockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid,
+						   LOCKMODE lockmode);
+extern void UnlockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid,
+							 LOCKMODE lockmode);
+
+/* Describe a locktag for error messages */
+extern void DescribeLockTag(StringInfo buf, const LOCKTAG *tag);
+
+#endif   /* LMGR_H */
--- a/pg_include/storage/lock.h
+++ b/pg_include/storage/lock.h
@@ -0,0 +1,552 @@
+/*-------------------------------------------------------------------------
+ *
+ * lock.h
+ *	  POSTGRES low-level lock mechanism
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/lock.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LOCK_H_
+#define LOCK_H_
+
+#include "storage/backendid.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+
+
+/* struct PGPROC is declared in proc.h, but must forward-reference it */
+typedef struct PGPROC PGPROC;
+
+typedef struct PROC_QUEUE
+{
+	SHM_QUEUE	links;			/* head of list of PGPROC objects */
+	int			size;			/* number of entries in list */
+} PROC_QUEUE;
+
+/* GUC variables */
+extern int	max_locks_per_xact;
+
+#ifdef LOCK_DEBUG
+extern int	Trace_lock_oidmin;
+extern bool Trace_locks;
+extern bool Trace_userlocks;
+extern int	Trace_lock_table;
+extern bool Debug_deadlocks;
+#endif   /* LOCK_DEBUG */
+
+
+/*
+ * Top-level transactions are identified by VirtualTransactionIDs comprising
+ * the BackendId of the backend running the xact, plus a locally-assigned
+ * LocalTransactionId.  These are guaranteed unique over the short term,
+ * but will be reused after a database restart; hence they should never
+ * be stored on disk.
+ *
+ * Note that struct VirtualTransactionId cannot be assumed to be atomically
+ * assignable as a whole.  However, type LocalTransactionId is assumed to
+ * be atomically assignable, and the backend ID doesn't change often enough
+ * to be a problem, so we can fetch or assign the two fields separately.
+ * We deliberately refrain from using the struct within PGPROC, to prevent
+ * coding errors from trying to use struct assignment with it; instead use
+ * GET_VXID_FROM_PGPROC().  (The vxid macros evaluate their arguments twice.)
+ */
+typedef struct
+{
+	BackendId	backendId;		/* determined at backend startup */
+	LocalTransactionId localTransactionId;		/* backend-local transaction
+												 * id */
+} VirtualTransactionId;
+
+#define InvalidLocalTransactionId		0
+#define LocalTransactionIdIsValid(lxid) ((lxid) != InvalidLocalTransactionId)
+#define VirtualTransactionIdIsValid(vxid) \
+	(((vxid).backendId != InvalidBackendId) && \
+	 LocalTransactionIdIsValid((vxid).localTransactionId))
+#define VirtualTransactionIdEquals(vxid1, vxid2) \
+	((vxid1).backendId == (vxid2).backendId && \
+	 (vxid1).localTransactionId == (vxid2).localTransactionId)
+#define SetInvalidVirtualTransactionId(vxid) \
+	((vxid).backendId = InvalidBackendId, \
+	 (vxid).localTransactionId = InvalidLocalTransactionId)
+#define GET_VXID_FROM_PGPROC(vxid, proc) \
+	((vxid).backendId = (proc).backendId, \
+	 (vxid).localTransactionId = (proc).lxid)
+
+
+/*
+ * LOCKMODE is an integer (1..N) indicating a lock type.  LOCKMASK is a bit
+ * mask indicating a set of held or requested lock types (the bit 1<<mode
+ * corresponds to a particular lock mode).
+ */
+typedef int LOCKMASK;
+typedef int LOCKMODE;
+
+/* MAX_LOCKMODES cannot be larger than the # of bits in LOCKMASK */
+#define MAX_LOCKMODES		10
+
+#define LOCKBIT_ON(lockmode) (1 << (lockmode))		/* only this mode's bit */
+#define LOCKBIT_OFF(lockmode) (~(1 << (lockmode)))	/* all bits but this mode's */
+
+
+/*
+ * This data structure defines the locking semantics associated with a
+ * "lock method".  The semantics specify the meaning of each lock mode
+ * (by defining which lock modes it conflicts with).
+ * All of this data is constant and is kept in const tables.
+ *
+ * numLockModes -- number of lock modes (READ,WRITE,etc) defined in this
+ *		lock method.  Must be less than MAX_LOCKMODES, as modes start at 1.
+ *
+ * conflictTab -- this is an array of bitmasks showing lock
+ *		mode conflicts.  conflictTab[i] is a mask with the j-th bit
+ *		turned on if lock modes i and j conflict.  Lock modes are
+ *		numbered 1..numLockModes; conflictTab[0] is unused.
+ *
+ * lockModeNames -- ID strings for debug printouts.
+ *
+ * trace_flag -- pointer to GUC trace flag for this lock method.  (The
+ * GUC variable is not constant, but we use "const" here to denote that
+ * it can't be changed through this reference.)
+ */
+typedef struct LockMethodData
+{
+	int			numLockModes;	/* modes are numbered 1..numLockModes */
+	const LOCKMASK *conflictTab;	/* indexed by lock mode; entry 0 unused */
+	const char *const * lockModeNames;	/* ID strings for debug printouts */
+	const bool *trace_flag;		/* read-only view of the GUC trace flag */
+} LockMethodData;
+
+typedef const LockMethodData *LockMethod;
+
+/*
+ * Lock methods are identified by LOCKMETHODID.  (Despite the declaration as
+ * uint16, we are constrained to 256 lockmethods by the layout of LOCKTAG.)
+ */
+typedef uint16 LOCKMETHODID;
+
+/* These identify the known lock methods */
+#define DEFAULT_LOCKMETHOD	1
+#define USER_LOCKMETHOD		2
+
+/*
+ * These are the valid values of type LOCKMODE for all the standard lock
+ * methods (both DEFAULT and USER).
+ */
+
+/* NoLock is not a lock mode, but a flag value meaning "don't get a lock" */
+#define NoLock					0
+
+#define AccessShareLock			1		/* SELECT */
+#define RowShareLock			2		/* SELECT FOR UPDATE/FOR SHARE */
+#define RowExclusiveLock		3		/* INSERT, UPDATE, DELETE */
+#define ShareUpdateExclusiveLock 4		/* VACUUM (non-FULL),ANALYZE, CREATE
+										 * INDEX CONCURRENTLY */
+#define ShareLock				5		/* CREATE INDEX (WITHOUT CONCURRENTLY) */
+#define ShareRowExclusiveLock	6		/* like EXCLUSIVE MODE, but allows ROW
+										 * SHARE */
+#define ExclusiveLock			7		/* blocks ROW SHARE/SELECT...FOR
+										 * UPDATE */
+#define AccessExclusiveLock		8		/* ALTER TABLE, DROP TABLE, VACUUM
+										 * FULL, and unqualified LOCK TABLE */
+
+
+/*
+ * LOCKTAG is the key information needed to look up a LOCK item in the
+ * lock hashtable.  A LOCKTAG value uniquely identifies a lockable object.
+ *
+ * The LockTagType enum defines the different kinds of objects we can lock.
+ * We can handle up to 256 different LockTagTypes (locktag_type is a uint8).
+ */
+typedef enum LockTagType
+{
+	LOCKTAG_RELATION,			/* whole relation */
+	/* ID info for a relation is DB OID + REL OID; DB OID = 0 if shared */
+	LOCKTAG_RELATION_EXTEND,	/* the right to extend a relation */
+	/* same ID info as RELATION */
+	LOCKTAG_PAGE,				/* one page of a relation */
+	/* ID info for a page is RELATION info + BlockNumber */
+	LOCKTAG_TUPLE,				/* one physical tuple */
+	/* ID info for a tuple is PAGE info + OffsetNumber */
+	LOCKTAG_TRANSACTION,		/* transaction (for waiting for xact done) */
+	/* ID info for a transaction is its TransactionId */
+	LOCKTAG_VIRTUALTRANSACTION, /* virtual transaction (ditto) */
+	/* ID info for a virtual transaction is its VirtualTransactionId */
+	LOCKTAG_OBJECT,				/* non-relation database object */
+	/* ID info for an object is DB OID + CLASS OID + OBJECT OID + SUBID */
+
+	/*
+	 * Note: object ID has same representation as in pg_depend and
+	 * pg_description, but notice that we are constraining SUBID to 16 bits.
+	 * Also, we use DB OID = 0 for shared objects such as tablespaces.
+	 */
+	LOCKTAG_USERLOCK,			/* reserved for old contrib/userlock code */
+	LOCKTAG_ADVISORY			/* advisory user locks */
+} LockTagType;
+
+#define LOCKTAG_LAST_TYPE	LOCKTAG_ADVISORY
+
+/*
+ * The LOCKTAG struct is defined with malice aforethought to fit into 16
+ * bytes with no padding.  Note that this would need adjustment if we were
+ * to widen Oid, BlockNumber, or TransactionId to more than 32 bits.
+ *
+ * We include lockmethodid in the locktag so that a single hash table in
+ * shared memory can store locks of different lockmethods.
+ */
+typedef struct LOCKTAG
+{
+	uint32		locktag_field1; /* a 32-bit ID field */
+	uint32		locktag_field2; /* a 32-bit ID field */
+	uint32		locktag_field3; /* a 32-bit ID field */
+	uint16		locktag_field4; /* a 16-bit ID field */
+	uint8		locktag_type;	/* see enum LockTagType */
+	uint8		locktag_lockmethodid;	/* lockmethod indicator */
+} LOCKTAG;
+
+/*
+ * These macros define how we map logical IDs of lockable objects into
+ * the physical fields of LOCKTAG.  Use these to set up LOCKTAG values, rather
+ * than accessing the fields directly.  Note multiple eval of target (and vxid)!
+ */
+#define SET_LOCKTAG_RELATION(locktag,dboid,reloid) \
+	((locktag).locktag_field1 = (dboid), \
+	 (locktag).locktag_field2 = (reloid), \
+	 (locktag).locktag_field3 = 0, \
+	 (locktag).locktag_field4 = 0, \
+	 (locktag).locktag_type = LOCKTAG_RELATION, \
+	 (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
+#define SET_LOCKTAG_RELATION_EXTEND(locktag,dboid,reloid) \
+	((locktag).locktag_field1 = (dboid), \
+	 (locktag).locktag_field2 = (reloid), \
+	 (locktag).locktag_field3 = 0, \
+	 (locktag).locktag_field4 = 0, \
+	 (locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \
+	 (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
+#define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \
+	((locktag).locktag_field1 = (dboid), \
+	 (locktag).locktag_field2 = (reloid), \
+	 (locktag).locktag_field3 = (blocknum), \
+	 (locktag).locktag_field4 = 0, \
+	 (locktag).locktag_type = LOCKTAG_PAGE, \
+	 (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
+#define SET_LOCKTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \
+	((locktag).locktag_field1 = (dboid), \
+	 (locktag).locktag_field2 = (reloid), \
+	 (locktag).locktag_field3 = (blocknum), \
+	 (locktag).locktag_field4 = (offnum), \
+	 (locktag).locktag_type = LOCKTAG_TUPLE, \
+	 (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
+#define SET_LOCKTAG_TRANSACTION(locktag,xid) \
+	((locktag).locktag_field1 = (xid), \
+	 (locktag).locktag_field2 = 0, \
+	 (locktag).locktag_field3 = 0, \
+	 (locktag).locktag_field4 = 0, \
+	 (locktag).locktag_type = LOCKTAG_TRANSACTION, \
+	 (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
+#define SET_LOCKTAG_VIRTUALTRANSACTION(locktag,vxid) \
+	((locktag).locktag_field1 = (vxid).backendId, \
+	 (locktag).locktag_field2 = (vxid).localTransactionId, \
+	 (locktag).locktag_field3 = 0, \
+	 (locktag).locktag_field4 = 0, \
+	 (locktag).locktag_type = LOCKTAG_VIRTUALTRANSACTION, \
+	 (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
+#define SET_LOCKTAG_OBJECT(locktag,dboid,classoid,objoid,objsubid) \
+	((locktag).locktag_field1 = (dboid), \
+	 (locktag).locktag_field2 = (classoid), \
+	 (locktag).locktag_field3 = (objoid), \
+	 (locktag).locktag_field4 = (objsubid), \
+	 (locktag).locktag_type = LOCKTAG_OBJECT, \
+	 (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
+#define SET_LOCKTAG_ADVISORY(locktag,id1,id2,id3,id4) \
+	((locktag).locktag_field1 = (id1), \
+	 (locktag).locktag_field2 = (id2), \
+	 (locktag).locktag_field3 = (id3), \
+	 (locktag).locktag_field4 = (id4), \
+	 (locktag).locktag_type = LOCKTAG_ADVISORY, \
+	 (locktag).locktag_lockmethodid = USER_LOCKMETHOD)
+
+
+/*
+ * Per-locked-object lock information:
+ *
+ * tag -- uniquely identifies the object being locked
+ * grantMask -- bitmask for all lock types currently granted on this object.
+ * waitMask -- bitmask for all lock types currently awaited on this object.
+ * procLocks -- list of PROCLOCK objects for this lock.
+ * waitProcs -- queue of processes waiting for this lock.
+ * requested -- count of each lock type currently requested on the lock
+ *		(includes requests already granted!!).
+ * nRequested -- total requested locks of all types.
+ * granted -- count of each lock type currently granted on the lock.
+ * nGranted -- total granted locks of all types.
+ *
+ * Note: these counts count 1 for each backend.  Internally to a backend,
+ * there may be multiple grabs on a particular lock, but this is not reflected
+ * into shared memory.
+ */
+typedef struct LOCK
+{
+	/* hash key */
+	LOCKTAG		tag;			/* unique identifier of lockable object */
+
+	/* data */
+	LOCKMASK	grantMask;		/* bitmask for lock types already granted */
+	LOCKMASK	waitMask;		/* bitmask for lock types awaited */
+	SHM_QUEUE	procLocks;		/* list of PROCLOCK objects assoc. with lock */
+	PROC_QUEUE	waitProcs;		/* list of PGPROC objects waiting on lock */
+	int			requested[MAX_LOCKMODES];		/* counts of requested locks */
+	int			nRequested;		/* total of requested[] array */
+	int			granted[MAX_LOCKMODES]; /* counts of granted locks */
+	int			nGranted;		/* total of granted[] array */
+} LOCK;
+
+#define LOCK_LOCKMETHOD(lock) ((LOCKMETHODID) (lock).tag.locktag_lockmethodid)
+
+
+/*
+ * We may have several different backends holding or awaiting locks
+ * on the same lockable object.  We need to store some per-holder/waiter
+ * information for each such holder (or would-be holder).  This is kept in
+ * a PROCLOCK struct.
+ *
+ * PROCLOCKTAG is the key information needed to look up a PROCLOCK item in the
+ * proclock hashtable.  A PROCLOCKTAG value uniquely identifies the combination
+ * of a lockable object and a holder/waiter for that object.  (We can use
+ * pointers here because the PROCLOCKTAG need only be unique for the lifespan
+ * of the PROCLOCK, and it will never outlive the lock or the proc.)
+ *
+ * Internally to a backend, it is possible for the same lock to be held
+ * for different purposes: the backend tracks transaction locks separately
+ * from session locks.  However, this is not reflected in the shared-memory
+ * state: we only track which backend(s) hold the lock.  This is OK since a
+ * backend can never block itself.
+ *
+ * The holdMask field shows the already-granted locks represented by this
+ * proclock.  Note that there will be a proclock object, possibly with
+ * zero holdMask, for any lock that the process is currently waiting on.
+ * Otherwise, proclock objects whose holdMasks are zero are recycled
+ * as soon as convenient.
+ *
+ * releaseMask is workspace for LockReleaseAll(): it shows the locks due
+ * to be released during the current call.  This must only be examined or
+ * set by the backend owning the PROCLOCK.
+ *
+ * Each PROCLOCK object is linked into lists for both the associated LOCK
+ * object and the owning PGPROC object.  Note that the PROCLOCK is entered
+ * into these lists as soon as it is created, even if no lock has yet been
+ * granted.  A PGPROC that is waiting for a lock to be granted will also be
+ * linked into the lock's waitProcs queue.
+ */
+typedef struct PROCLOCKTAG
+{
+	/* NB: we assume this struct contains no padding! */
+	LOCK	   *myLock;			/* link to per-lockable-object information */
+	PGPROC	   *myProc;			/* link to PGPROC of owning backend */
+} PROCLOCKTAG;
+
+typedef struct PROCLOCK
+{
+	/* tag */
+	PROCLOCKTAG tag;			/* unique identifier of proclock object */
+
+	/* data */
+	LOCKMASK	holdMask;		/* bitmask for lock types currently held */
+	LOCKMASK	releaseMask;	/* bitmask for lock types to be released */
+	SHM_QUEUE	lockLink;		/* list link in LOCK's list of proclocks */
+	SHM_QUEUE	procLink;		/* list link in PGPROC's list of proclocks */
+} PROCLOCK;
+
+#define PROCLOCK_LOCKMETHOD(proclock) \
+	LOCK_LOCKMETHOD(*((proclock).tag.myLock))
+
+/*
+ * Each backend also maintains a local hash table with information about each
+ * lock it is currently interested in.  In particular the local table counts
+ * the number of times that lock has been acquired.  This allows multiple
+ * requests for the same lock to be executed without additional accesses to
+ * shared memory.  We also track the number of lock acquisitions per
+ * ResourceOwner, so that we can release just those locks belonging to a
+ * particular ResourceOwner.
+ */
+typedef struct LOCALLOCKTAG
+{
+	LOCKTAG		lock;			/* identifies the lockable object */
+	LOCKMODE	mode;			/* lock mode for this table entry */
+} LOCALLOCKTAG;
+
+typedef struct LOCALLOCKOWNER
+{
+	/*
+	 * Note: if owner is NULL then the lock is held on behalf of the session;
+	 * otherwise it is held on behalf of my current transaction.
+	 *
+	 * Must use a forward struct reference to avoid circularity.
+	 */
+	struct ResourceOwnerData *owner;
+	int64		nLocks;			/* # of times held by this owner */
+} LOCALLOCKOWNER;
+
+typedef struct LOCALLOCK
+{
+	/* tag */
+	LOCALLOCKTAG tag;			/* unique identifier of locallock entry */
+
+	/* data */
+	LOCK	   *lock;			/* associated LOCK object in shared mem */
+	PROCLOCK   *proclock;		/* associated PROCLOCK object in shmem */
+	uint32		hashcode;		/* copy of LOCKTAG's hash value */
+	int64		nLocks;			/* total number of times lock is held */
+	int			numLockOwners;	/* # of relevant ResourceOwners */
+	int			maxLockOwners;	/* allocated size of lockOwners array */
+	bool		holdsStrongLockCount;	/* bumped FastPathStrongRelationLocks? */
+	LOCALLOCKOWNER *lockOwners; /* dynamically resizable array */
+} LOCALLOCK;
+
+#define LOCALLOCK_LOCKMETHOD(llock) ((llock).tag.lock.locktag_lockmethodid)
+
+
+/*
+ * These structures hold information passed from lmgr internals to the lock
+ * listing user-level functions (in lockfuncs.c).
+ */
+
+typedef struct LockInstanceData
+{
+	LOCKTAG		locktag;		/* locked object */
+	LOCKMASK	holdMask;		/* locks held by this PGPROC */
+	LOCKMODE	waitLockMode;	/* lock awaited by this PGPROC, if any */
+	BackendId	backend;		/* backend ID of this PGPROC */
+	LocalTransactionId lxid;	/* local transaction ID of this PGPROC */
+	int			pid;			/* pid of this PGPROC */
+	bool		fastpath;		/* taken via fastpath? */
+} LockInstanceData;
+
+typedef struct LockData
+{
+	int			nelements;		/* The length of the locks array */
+	LockInstanceData *locks;	/* array of nelements entries */
+} LockData;
+
+
+/* Result codes for LockAcquire() */
+typedef enum
+{
+	LOCKACQUIRE_NOT_AVAIL,		/* lock not available, and dontWait=true */
+	LOCKACQUIRE_OK,				/* lock successfully acquired */
+	LOCKACQUIRE_ALREADY_HELD	/* incremented count for lock already held */
+} LockAcquireResult;
+
+/* Deadlock states identified by DeadLockCheck() */
+typedef enum
+{
+	DS_NOT_YET_CHECKED,			/* no deadlock check has run yet */
+	DS_NO_DEADLOCK,				/* no deadlock detected */
+	DS_SOFT_DEADLOCK,			/* deadlock avoided by queue rearrangement */
+	DS_HARD_DEADLOCK,			/* deadlock, no way out but ERROR */
+	DS_BLOCKED_BY_AUTOVACUUM	/* no deadlock; queue blocked by autovacuum
+								 * worker */
+} DeadLockState;
+
+
+/*
+ * The lockmgr's shared hash tables are partitioned to reduce contention.
+ * To determine which partition a given locktag belongs to, compute the tag's
+ * hash code with LockTagHashCode(), then apply one of these macros.
+ * NB: NUM_LOCK_PARTITIONS must be a power of 2!
+ */
+#define LockHashPartition(hashcode)  ((hashcode) % NUM_LOCK_PARTITIONS)
+/* LWLockId for the partition selected by hashcode (FirstLockMgrLock + n) */
+#define LockHashPartitionLock(hashcode) \
+	((LWLockId) (FirstLockMgrLock + ((hashcode) % NUM_LOCK_PARTITIONS)))
+
+
+/*
+ * function prototypes
+ */
+extern void InitLocks(void);
+extern LockMethod GetLocksMethodTable(const LOCK *lock);
+extern uint32 LockTagHashCode(const LOCKTAG *locktag);
+extern LockAcquireResult LockAcquire(const LOCKTAG *locktag,
+			LOCKMODE lockmode,
+			bool sessionLock,
+			bool dontWait);
+extern LockAcquireResult LockAcquireExtended(const LOCKTAG *locktag,
+					LOCKMODE lockmode,
+					bool sessionLock,
+					bool dontWait,
+					bool report_memory_error);
+extern void AbortStrongLockAcquire(void);
+extern bool LockRelease(const LOCKTAG *locktag,
+			LOCKMODE lockmode, bool sessionLock);
+extern void LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks);
+extern void LockReleaseSession(LOCKMETHODID lockmethodid);
+extern void LockReleaseCurrentOwner(void);
+extern void LockReassignCurrentOwner(void);
+extern bool LockHasWaiters(const LOCKTAG *locktag,
+			   LOCKMODE lockmode, bool sessionLock);
+extern VirtualTransactionId *GetLockConflicts(const LOCKTAG *locktag,
+				 LOCKMODE lockmode);
+extern void AtPrepare_Locks(void);
+extern void PostPrepare_Locks(TransactionId xid);
+extern int LockCheckConflicts(LockMethod lockMethodTable,
+				   LOCKMODE lockmode,
+				   LOCK *lock, PROCLOCK *proclock, PGPROC *proc);
+extern void GrantLock(LOCK *lock, PROCLOCK *proclock, LOCKMODE lockmode);
+extern void GrantAwaitedLock(void);
+extern void RemoveFromWaitQueue(PGPROC *proc, uint32 hashcode);
+extern Size LockShmemSize(void);
+extern LockData *GetLockStatusData(void);
+
+extern void ReportLockTableError(bool report);
+
+typedef struct xl_standby_lock
+{
+	TransactionId xid;			/* xid of holder of AccessExclusiveLock */
+	Oid			dbOid;			/* presumably database OID of locked object -- TODO confirm */
+	Oid			relOid;			/* presumably OID of locked relation -- TODO confirm */
+} xl_standby_lock;
+
+extern xl_standby_lock *GetRunningTransactionLocks(int *nlocks);
+extern const char *GetLockmodeName(LOCKMETHODID lockmethodid, LOCKMODE mode);
+
+extern void lock_twophase_recover(TransactionId xid, uint16 info,
+					  void *recdata, uint32 len);
+extern void lock_twophase_postcommit(TransactionId xid, uint16 info,
+						 void *recdata, uint32 len);
+extern void lock_twophase_postabort(TransactionId xid, uint16 info,
+						void *recdata, uint32 len);
+extern void lock_twophase_standby_recover(TransactionId xid, uint16 info,
+							  void *recdata, uint32 len);
+
+extern DeadLockState DeadLockCheck(PGPROC *proc);
+extern PGPROC *GetBlockingAutoVacuumPgproc(void);
+extern void DeadLockReport(void);
+extern void RememberSimpleDeadLock(PGPROC *proc1,
+					   LOCKMODE lockmode,
+					   LOCK *lock,
+					   PGPROC *proc2);
+extern void InitDeadLockChecking(void);
+
+#ifdef LOCK_DEBUG
+extern void DumpLocks(PGPROC *proc);
+extern void DumpAllLocks(void);
+#endif
+
+/* Lock a VXID (used to wait for a transaction to finish) */
+extern void VirtualXactLockTableInsert(VirtualTransactionId vxid);
+extern void VirtualXactLockTableCleanup(void);
+extern bool VirtualXactLock(VirtualTransactionId vxid, bool wait);
+
+#endif   /* LOCK_H */
--- a/pg_include/storage/lwlock.h
+++ b/pg_include/storage/lwlock.h
@@ -0,0 +1,122 @@
+/*-------------------------------------------------------------------------
+ *
+ * lwlock.h
+ *	  Lightweight lock manager
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/lwlock.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LWLOCK_H
+#define LWLOCK_H
+
+/*
+ * It's a bit odd to declare NUM_BUFFER_PARTITIONS and NUM_LOCK_PARTITIONS
+ * here, but we need them to set up enum LWLockId correctly, and having
+ * this file include lock.h or bufmgr.h would be backwards.
+ */
+
+/* Number of partitions of the shared buffer mapping hashtable */
+#define NUM_BUFFER_PARTITIONS  16
+
+/* Number of partitions the shared lock tables are divided into */
+#define LOG2_NUM_LOCK_PARTITIONS  4
+#define NUM_LOCK_PARTITIONS  (1 << LOG2_NUM_LOCK_PARTITIONS)
+
+/* Number of partitions the shared predicate lock tables are divided into */
+#define LOG2_NUM_PREDICATELOCK_PARTITIONS  4
+#define NUM_PREDICATELOCK_PARTITIONS  (1 << LOG2_NUM_PREDICATELOCK_PARTITIONS)
+
+/*
+ * We have a number of predefined LWLocks, plus a bunch of LWLocks that are
+ * dynamically assigned (e.g., for shared buffers).  The LWLock structures
+ * live in shared memory (since they contain shared data) and are identified
+ * by values of this enumerated type.  We abuse the notion of an enum somewhat
+ * by allowing values not listed in the enum declaration to be assigned.
+ * The extra value MaxDynamicLWLock is there to keep the compiler from
+ * deciding that the enum can be represented as char or short ...
+ *
+ * If you remove a lock, please replace it with a placeholder. This retains
+ * the lock numbering, which is helpful for DTrace and other external
+ * debugging scripts.
+ */
+typedef enum LWLockId
+{
+	BufFreelistLock,
+	ShmemIndexLock,
+	OidGenLock,
+	XidGenLock,
+	ProcArrayLock,
+	SInvalReadLock,
+	SInvalWriteLock,
+	WALInsertLock,
+	WALWriteLock,
+	ControlFileLock,
+	CheckpointLock,
+	CLogControlLock,
+	SubtransControlLock,
+	MultiXactGenLock,
+	MultiXactOffsetControlLock,
+	MultiXactMemberControlLock,
+	RelCacheInitLock,
+	CheckpointerCommLock,
+	TwoPhaseStateLock,
+	TablespaceCreateLock,
+	BtreeVacuumLock,
+	AddinShmemInitLock,
+	AutovacuumLock,
+	AutovacuumScheduleLock,
+	SyncScanLock,
+	RelationMappingLock,
+	AsyncCtlLock,
+	AsyncQueueLock,
+	SerializableXactHashLock,
+	SerializableFinishedListLock,
+	SerializablePredicateLockListLock,
+	OldSerXidLock,
+	SyncRepLock,
+	/* Individual lock IDs end here; each First* below starts an ID range */
+	FirstBufMappingLock,		/* NUM_BUFFER_PARTITIONS IDs from here */
+	FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,
+	FirstPredicateLockMgrLock = FirstLockMgrLock + NUM_LOCK_PARTITIONS,
+
+	/* must be last except for MaxDynamicLWLock: */
+	NumFixedLWLocks = FirstPredicateLockMgrLock + NUM_PREDICATELOCK_PARTITIONS,
+
+	MaxDynamicLWLock = 1000000000	/* keeps enum wider than char/short */
+} LWLockId;
+
+
+typedef enum LWLockMode
+{
+	LW_EXCLUSIVE,
+	LW_SHARED,
+	LW_WAIT_UNTIL_FREE			/* A special mode used in PGPROC->lwlockMode,
+								 * when waiting for lock to become free. Not
+								 * to be used as LWLockAcquire argument */
+} LWLockMode;
+
+
+#ifdef LOCK_DEBUG
+extern bool Trace_lwlocks;
+#endif
+
+extern LWLockId LWLockAssign(void);
+extern void LWLockAcquire(LWLockId lockid, LWLockMode mode);
+extern bool LWLockConditionalAcquire(LWLockId lockid, LWLockMode mode);
+extern bool LWLockAcquireOrWait(LWLockId lockid, LWLockMode mode);
+extern void LWLockRelease(LWLockId lockid);
+extern void LWLockReleaseAll(void);
+extern bool LWLockHeldByMe(LWLockId lockid);
+
+extern int	NumLWLocks(void);
+extern Size LWLockShmemSize(void);
+extern void CreateLWLocks(void);
+
+extern void RequestAddinLWLocks(int n);
+
+#endif   /* LWLOCK_H */
--- a/pg_include/storage/off.h
+++ b/pg_include/storage/off.h
@@ -0,0 +1,58 @@
+/*-------------------------------------------------------------------------
+ *
+ * off.h
+ *	  POSTGRES disk "offset" definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/off.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef OFF_H
+#define OFF_H
+
+#include "storage/itemid.h"
+/*
+ * OffsetNumber:
+ *
+ * this is a 1-based index into the linp (ItemIdData) array in the
+ * header of each disk page.
+ */
+typedef uint16 OffsetNumber;
+
+#define InvalidOffsetNumber		((OffsetNumber) 0)
+#define FirstOffsetNumber		((OffsetNumber) 1)
+#define MaxOffsetNumber			((OffsetNumber) (BLCKSZ / sizeof(ItemIdData)))
+#define OffsetNumberMask		(0xffff)		/* valid uint16 bits */
+
+/* ----------------
+ *		support macros
+ * ----------------
+ */
+
+/*
+ * OffsetNumberIsValid
+ *		True iff the offset number is valid.  May evaluate its argument twice.
+ */
+#define OffsetNumberIsValid(offsetNumber) \
+	((bool) (((offsetNumber) != InvalidOffsetNumber) && \
+			 ((offsetNumber) <= MaxOffsetNumber)))
+
+/*
+ * OffsetNumberNext
+ * OffsetNumberPrev
+ *		Step an OffsetNumber forward or backward by one, casting the result
+ *		back to OffsetNumber.  Trivial as they are, these macros document
+ *		intent: they mark places where an OffsetNumber is being advanced or
+ *		moved back, as opposed to places where one is subtracted from an
+ *		OffsetNumber merely to form a real (zero-based) C array index.
+ */
+#define OffsetNumberNext(offsetNumber) \
+	((OffsetNumber) ((offsetNumber) + 1))
+#define OffsetNumberPrev(offsetNumber) \
+	((OffsetNumber) ((offsetNumber) - 1))
+
+#endif   /* OFF_H */
--- a/pg_include/storage/pg_sema.h
+++ b/pg_include/storage/pg_sema.h
@@ -0,0 +1,83 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_sema.h
+ *	  Platform-independent API for semaphores.
+ *
+ * PostgreSQL requires counting semaphores (the kind that keep track of
+ * multiple unlock operations, and will allow an equal number of subsequent
+ * lock operations before blocking).  The underlying implementation is
+ * not the same on every platform.	This file defines the API that must
+ * be provided by each port.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/pg_sema.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_SEMA_H
+#define PG_SEMA_H
+
+/*
+ * PGSemaphoreData and pointer type PGSemaphore are the data structure
+ * representing an individual semaphore.  The contents of PGSemaphoreData
+ * vary across implementations and must never be touched by platform-
+ * independent code.  PGSemaphoreData structures are always allocated
+ * in shared memory (to support implementations where the data changes during
+ * lock/unlock).
+ *
+ * pg_config.h must define exactly one of the USE_xxx_SEMAPHORES symbols.
+ */
+
+#ifdef USE_NAMED_POSIX_SEMAPHORES
+/* Named POSIX: PGSemaphoreData is only a pointer to the sem_t object. */
+#include <semaphore.h>
+
+typedef sem_t *PGSemaphoreData;
+#endif
+
+#ifdef USE_UNNAMED_POSIX_SEMAPHORES
+/* Unnamed POSIX: PGSemaphoreData is the sem_t object itself. */
+#include <semaphore.h>
+
+typedef sem_t PGSemaphoreData;
+#endif
+
+#ifdef USE_SYSV_SEMAPHORES
+/* SysV: a semaphore is addressed by its set id plus index within the set. */
+typedef struct PGSemaphoreData
+{
+	int			semId;			/* semaphore set identifier */
+	int			semNum;			/* semaphore number within set */
+} PGSemaphoreData;
+#endif
+
+#ifdef USE_WIN32_SEMAPHORES
+/* Win32: PGSemaphoreData is a HANDLE. */
+typedef HANDLE PGSemaphoreData;
+#endif
+/* Fails to compile if no USE_xxx_SEMAPHORES symbol defined PGSemaphoreData. */
+typedef PGSemaphoreData *PGSemaphore;
+
+
+/* Module initialization (called during postmaster start or shmem reinit) */
+extern void PGReserveSemaphores(int maxSemas, int port);
+
+/* Initialize a PGSemaphore structure to represent a sema with count 1 */
+extern void PGSemaphoreCreate(PGSemaphore sema);
+
+/* Reset a previously-initialized PGSemaphore to have count 0 */
+extern void PGSemaphoreReset(PGSemaphore sema);
+
+/* Lock a semaphore (decrement count), blocking if count would be < 0 */
+extern void PGSemaphoreLock(PGSemaphore sema, bool interruptOK);
+
+/* Unlock a semaphore (increment count) */
+extern void PGSemaphoreUnlock(PGSemaphore sema);
+
+/* Lock a semaphore only if able to do so without blocking */
+extern bool PGSemaphoreTryLock(PGSemaphore sema);
+
+#endif   /* PG_SEMA_H */
--- a/pg_include/storage/pg_shmem.h
+++ b/pg_include/storage/pg_shmem.h
@@ -0,0 +1,58 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_shmem.h
+ *	  Platform-independent API for shared memory support.
+ *
+ * Every port is expected to support shared memory with approximately
+ * SysV-ish semantics; in particular, a memory block is not anonymous
+ * but has an ID, and we must be able to tell whether there are any
+ * remaining processes attached to a block of a specified ID.
+ *
+ * To simplify life for the SysV implementation, the ID is assumed to
+ * consist of two unsigned long values (these are key and ID in SysV
+ * terms).	Other platforms may ignore the second value if they need
+ * only one ID number.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/pg_shmem.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_SHMEM_H
+#define PG_SHMEM_H
+
+typedef struct PGShmemHeader	/* standard header for all Postgres shmem */
+{
+	int32		magic;			/* magic # to identify Postgres segments */
+#define PGShmemMagic  679834894
+	pid_t		creatorPID;		/* PID of creating process */
+	Size		totalsize;		/* total size of segment */
+	Size		freeoffset;		/* offset to first free space */
+	void	   *index;			/* pointer to ShmemIndex table */
+#ifndef WIN32					/* Windows doesn't have useful inode#s */
+	dev_t		device;			/* device data directory is on */
+	ino_t		inode;			/* inode number of data directory */
+#endif
+} PGShmemHeader;
+
+
+#ifdef EXEC_BACKEND
+#ifndef WIN32
+extern unsigned long UsedShmemSegID;
+#else
+extern HANDLE UsedShmemSegID;
+#endif
+extern void *UsedShmemSegAddr;
+
+extern void PGSharedMemoryReAttach(void);
+#endif
+
+extern PGShmemHeader *PGSharedMemoryCreate(Size size, bool makePrivate,
+					 int port);
+extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2);
+extern void PGSharedMemoryDetach(void);
+
+#endif   /* PG_SHMEM_H */
--- a/pg_include/storage/pmsignal.h
+++ b/pg_include/storage/pmsignal.h
@@ -0,0 +1,55 @@
+/*-------------------------------------------------------------------------
+ *
+ * pmsignal.h
+ *	  routines for signaling the postmaster from its child processes
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/pmsignal.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PMSIGNAL_H
+#define PMSIGNAL_H
+
+/*
+ * Reasons for signaling the postmaster.  We can cope with simultaneous
+ * signals for different reasons.  If the same reason is signaled multiple
+ * times in quick succession, however, the postmaster is likely to observe
+ * only one notification of it.  This is okay for the present uses.
+ */
+typedef enum
+{
+	PMSIGNAL_RECOVERY_STARTED,	/* recovery has started */
+	PMSIGNAL_BEGIN_HOT_STANDBY, /* begin Hot Standby */
+	PMSIGNAL_WAKEN_ARCHIVER,	/* send a NOTIFY signal to xlog archiver */
+	PMSIGNAL_ROTATE_LOGFILE,	/* send SIGUSR1 to syslogger to rotate logfile */
+	PMSIGNAL_START_AUTOVAC_LAUNCHER,	/* start an autovacuum launcher */
+	PMSIGNAL_START_AUTOVAC_WORKER,		/* start an autovacuum worker */
+	PMSIGNAL_START_WALRECEIVER, /* start a walreceiver */
+	PMSIGNAL_ADVANCE_STATE_MACHINE,		/* advance postmaster's state machine */
+
+	NUM_PMSIGNALS				/* Must be last value of enum! */
+} PMSignalReason;
+
+/* PMSignalData is an opaque struct, details known only within pmsignal.c */
+typedef struct PMSignalData PMSignalData;
+
+/*
+ * prototypes for functions in pmsignal.c
+ */
+extern Size PMSignalShmemSize(void);
+extern void PMSignalShmemInit(void);
+extern void SendPostmasterSignal(PMSignalReason reason);
+extern bool CheckPostmasterSignal(PMSignalReason reason);
+extern int	AssignPostmasterChildSlot(void);
+extern bool ReleasePostmasterChildSlot(int slot);
+extern bool IsPostmasterChildWalSender(int slot);
+extern void MarkPostmasterChildActive(void);
+extern void MarkPostmasterChildInactive(void);
+extern void MarkPostmasterChildWalSender(void);
+extern bool PostmasterIsAlive(void);
+
+#endif   /* PMSIGNAL_H */
--- a/pg_include/storage/pos.h
+++ b/pg_include/storage/pos.h
@@ -0,0 +1,64 @@
+/*-------------------------------------------------------------------------
+ *
+ * pos.h
+ *	  POSTGRES "position" definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/pos.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef POS_H
+#define POS_H
+
+
+/*
+ * a 'position' used to be <pagenumber, offset> in postgres.  this has
+ * been changed to just <offset> as the notion of having multiple pages
+ * within a block has been removed.
+ *
+ * the 'offset' abstraction is somewhat confusing.  it is NOT a byte
+ * offset within the page; instead, it is an offset into the line
+ * pointer array contained on every page that stores (heap or index)
+ * tuples.
+ */
+typedef bits16 PositionIdData;
+typedef PositionIdData *PositionId;
+
+/* ----------------
+ *		support macros
+ * ----------------
+ */
+
+/*
+ * PositionIdIsValid
+ *		True iff the position identifier is valid.
+ */
+#define PositionIdIsValid(positionId) \
+	PointerIsValid(positionId)
+
+/*
+ * PositionIdSetInvalid
+ *		Store 0 (the invalid position) through the positionId pointer.
+ */
+#define PositionIdSetInvalid(positionId) \
+	(*(positionId) = (bits16) 0)
+
+/*
+ * PositionIdSet
+ *		Store offsetNumber through the positionId pointer.
+ */
+#define PositionIdSet(positionId, offsetNumber) \
+	(*(positionId) = (offsetNumber))
+
+/*
+ * PositionIdGetOffsetNumber
+ *		Retrieve the offset number from a position identifier.
+ */
+#define PositionIdGetOffsetNumber(positionId) \
+	((OffsetNumber) *(positionId))
+
+#endif   /* POS_H */
--- a/pg_include/storage/predicate.h
+++ b/pg_include/storage/predicate.h
@@ -0,0 +1,73 @@
+/*-------------------------------------------------------------------------
+ *
+ * predicate.h
+ *	  POSTGRES public predicate locking definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/predicate.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PREDICATE_H
+#define PREDICATE_H
+
+#include "utils/relcache.h"
+#include "utils/snapshot.h"
+
+
+/*
+ * GUC variables
+ */
+extern int	max_predicate_locks_per_xact;
+
+
+/* Number of SLRU buffers to use for predicate locking */
+#define NUM_OLDSERXID_BUFFERS	16
+
+
+/*
+ * function prototypes
+ */
+
+/* housekeeping for shared memory predicate lock structures */
+extern void InitPredicateLocks(void);
+extern Size PredicateLockShmemSize(void);
+
+extern void CheckPointPredicate(void);
+
+/* predicate lock reporting */
+extern bool PageIsPredicateLocked(Relation relation, BlockNumber blkno);
+
+/* predicate lock maintenance */
+extern Snapshot GetSerializableTransactionSnapshot(Snapshot snapshot);
+extern void SetSerializableTransactionSnapshot(Snapshot snapshot,
+								   TransactionId sourcexid);
+extern void RegisterPredicateLockingXid(TransactionId xid);
+extern void PredicateLockRelation(Relation relation, Snapshot snapshot);
+extern void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot);
+extern void PredicateLockTuple(Relation relation, HeapTuple tuple, Snapshot snapshot);
+extern void PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, BlockNumber newblkno);
+extern void PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, BlockNumber newblkno);
+extern void TransferPredicateLocksToHeapRelation(Relation relation);
+extern void ReleasePredicateLocks(bool isCommit);
+
+/* conflict detection (may also trigger rollback) */
+extern void CheckForSerializableConflictOut(bool valid, Relation relation, HeapTuple tuple,
+								Buffer buffer, Snapshot snapshot);
+extern void CheckForSerializableConflictIn(Relation relation, HeapTuple tuple, Buffer buffer);
+extern void CheckTableForSerializableConflictIn(Relation relation);
+
+/* final rollback checking */
+extern void PreCommit_CheckForSerializationFailure(void);
+
+/* two-phase commit support */
+extern void AtPrepare_PredicateLocks(void);
+extern void PostPrepare_PredicateLocks(TransactionId xid);
+extern void PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit);
+extern void predicatelock_twophase_recover(TransactionId xid, uint16 info,
+							   void *recdata, uint32 len);
+
+#endif   /* PREDICATE_H */
--- a/pg_include/storage/predicate_internals.h
+++ b/pg_include/storage/predicate_internals.h
@@ -0,0 +1,490 @@
+/*-------------------------------------------------------------------------
+ *
+ * predicate_internals.h
+ *	  POSTGRES internal predicate locking definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/predicate_internals.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PREDICATE_INTERNALS_H
+#define PREDICATE_INTERNALS_H
+
+#include "storage/lock.h"
+
+/*
+ * Commit number.
+ */
+typedef uint64 SerCommitSeqNo;
+
+/*
+ * Reserved commit sequence numbers:
+ *	- 0 is reserved to indicate a non-existent SLRU entry; it cannot be
+ *	  used as a SerCommitSeqNo, even an invalid one
+ *	- InvalidSerCommitSeqNo is used to indicate a transaction that
+ *	  hasn't committed yet, so use a number greater than all valid
+ *	  ones to make comparison do the expected thing
+ *	- RecoverySerCommitSeqNo is used to refer to transactions that
+ *	  happened before a crash/recovery, since we restart the sequence
+ *	  at that point.  It's earlier than all normal sequence numbers,
+ *	  and is only used by recovered prepared transactions
+ */
+#define InvalidSerCommitSeqNo		((SerCommitSeqNo) UINT64CONST(0xFFFFFFFFFFFFFFFF))
+#define RecoverySerCommitSeqNo		((SerCommitSeqNo) 1)
+#define FirstNormalSerCommitSeqNo	((SerCommitSeqNo) 2)
+
+/*
+ * The SERIALIZABLEXACT struct contains information needed for each
+ * serializable database transaction to support SSI techniques.
+ *
+ * A home-grown list is maintained in shared memory to manage these.
+ * An entry is used when the serializable transaction acquires a snapshot.
+ * Unless the transaction is rolled back, this entry must generally remain
+ * until all concurrent transactions have completed.  (There are special
+ * optimizations for READ ONLY transactions which often allow them to be
+ * cleaned up earlier.)  A transaction which is rolled back is cleaned up
+ * as soon as possible.
+ *
+ * Eligibility for cleanup of committed transactions is generally determined
+ * by comparing the transaction's finishedBefore field to
+ * SerializableGlobalXmin.
+ */
+typedef struct SERIALIZABLEXACT
+{
+	VirtualTransactionId vxid;	/* The executing process always has one of
+								 * these. */
+
+	/*
+	 * We use two numbers to track the order that transactions commit. Before
+	 * commit, a transaction is marked as prepared, and prepareSeqNo is set.
+	 * Shortly after commit, it's marked as committed, and commitSeqNo is set.
+	 * This doesn't give a strict commit order, but these two values together
+	 * are good enough for us, as we can always err on the safe side and
+	 * assume that there's a conflict, if we can't be sure of the exact
+	 * ordering of two commits.
+	 *
+	 * Note that a transaction is marked as prepared for a short period during
+	 * commit processing, even if two-phase commit is not used. But with
+	 * two-phase commit, a transaction can stay in prepared state for some
+	 * time.
+	 */
+	SerCommitSeqNo prepareSeqNo;
+	SerCommitSeqNo commitSeqNo;
+
+	/* these values are not both interesting at the same time */
+	union
+	{
+		SerCommitSeqNo earliestOutConflictCommit;		/* when committed with
+														 * conflict out */
+		SerCommitSeqNo lastCommitBeforeSnapshot;		/* when not committed or
+														 * no conflict out */
+	}			SeqNo;
+	SHM_QUEUE	outConflicts;	/* list of write transactions whose data we
+								 * couldn't read. */
+	SHM_QUEUE	inConflicts;	/* list of read transactions which couldn't
+								 * see our write. */
+	SHM_QUEUE	predicateLocks; /* list of associated PREDICATELOCK objects */
+	SHM_QUEUE	finishedLink;	/* list link in
+								 * FinishedSerializableTransactions */
+
+	/*
+	 * for r/o transactions: list of concurrent r/w transactions that we could
+	 * potentially have conflicts with, and vice versa for r/w transactions
+	 */
+	SHM_QUEUE	possibleUnsafeConflicts;
+
+	TransactionId topXid;		/* top level xid for the transaction, if one
+								 * exists; else invalid */
+	TransactionId finishedBefore;		/* invalid means still running; else
+										 * the struct expires when no
+										 * serializable xids are before this. */
+	TransactionId xmin;			/* the transaction's snapshot xmin */
+	uint32		flags;			/* OR'd combination of values defined below */
+	int			pid;			/* pid of associated process */
+} SERIALIZABLEXACT;
+
+#define SXACT_FLAG_COMMITTED			0x00000001		/* already committed */
+#define SXACT_FLAG_PREPARED				0x00000002		/* about to commit */
+#define SXACT_FLAG_ROLLED_BACK			0x00000004		/* already rolled back */
+#define SXACT_FLAG_DOOMED				0x00000008		/* will roll back */
+/*
+ * The following flag actually means that the flagged transaction has a
+ * conflict out *to a transaction which committed ahead of it*.  It's hard
+ * to get that into a name of a reasonable length.
+ */
+#define SXACT_FLAG_CONFLICT_OUT			0x00000010
+#define SXACT_FLAG_READ_ONLY			0x00000020
+#define SXACT_FLAG_DEFERRABLE_WAITING	0x00000040
+#define SXACT_FLAG_RO_SAFE				0x00000080
+#define SXACT_FLAG_RO_UNSAFE			0x00000100
+#define SXACT_FLAG_SUMMARY_CONFLICT_IN	0x00000200
+#define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400
+
+/*
+ * The following types are used to provide an ad hoc list for holding
+ * SERIALIZABLEXACT objects.  An HTAB is overkill, since there is no need to
+ * access these by key -- there are direct pointers to these objects where
+ * needed.	If a shared memory list is created, these types can probably be
+ * eliminated in favor of using the general solution.
+ */
+typedef struct PredXactListElementData
+{
+	SHM_QUEUE	link;			/* list link; presumably for the PredXactListData lists -- confirm */
+	SERIALIZABLEXACT sxact;		/* the transaction record, stored inline */
+}	PredXactListElementData;
+
+typedef struct PredXactListElementData *PredXactListElement;
+
+#define PredXactListElementDataSize \
+		((Size)MAXALIGN(sizeof(PredXactListElementData)))
+
+typedef struct PredXactListData
+{
+	SHM_QUEUE	availableList;
+	SHM_QUEUE	activeList;
+
+	/*
+	 * These global variables are maintained when registering and cleaning up
+	 * serializable transactions.  They must be global across all backends,
+	 * but are not needed outside the predicate.c source file. Protected by
+	 * SerializableXactHashLock.
+	 */
+	TransactionId SxactGlobalXmin;		/* global xmin for active serializable
+										 * transactions */
+	int			SxactGlobalXminCount;	/* how many active serializable
+										 * transactions have this xmin */
+	int			WritableSxactCount;		/* how many non-read-only serializable
+										 * transactions are active */
+	SerCommitSeqNo LastSxactCommitSeqNo;		/* a strictly monotonically
+												 * increasing number for
+												 * commits of serializable
+												 * transactions */
+	/* Protected by SerializableXactHashLock. */
+	SerCommitSeqNo CanPartialClearThrough;		/* can clear predicate locks
+												 * and inConflicts for
+												 * committed transactions
+												 * through this seq no */
+	/* Protected by SerializableFinishedListLock. */
+	SerCommitSeqNo HavePartialClearedThrough;	/* have cleared through this
+												 * seq no */
+	SERIALIZABLEXACT *OldCommittedSxact;		/* shared copy of dummy sxact */
+
+	PredXactListElement element;
+}	PredXactListData;
+
+typedef struct PredXactListData *PredXactList;
+
+#define PredXactListDataSize \
+		((Size)MAXALIGN(sizeof(PredXactListData)))
+
+
+/*
+ * The following types are used to provide lists of rw-conflicts between
+ * pairs of transactions.  Since exactly the same information is needed,
+ * they are also used to record possible unsafe transaction relationships
+ * for purposes of identifying safe snapshots for read-only transactions.
+ *
+ * When a RWConflictData is not in use to record either type of relationship
+ * between a pair of transactions, it is kept on an "available" list.  The
+ * outLink field is used for maintaining that list.
+ */
+typedef struct RWConflictData
+{
+	SHM_QUEUE	outLink;		/* link for list of conflicts out from a sxact */
+	SHM_QUEUE	inLink;			/* link for list of conflicts in to a sxact */
+	SERIALIZABLEXACT *sxactOut;	/* presumably the sxact the conflict is "out" from -- confirm */
+	SERIALIZABLEXACT *sxactIn;	/* presumably the sxact the conflict is "in" to -- confirm */
+}	RWConflictData;
+
+typedef struct RWConflictData *RWConflict;
+
+#define RWConflictDataSize \
+		((Size)MAXALIGN(sizeof(RWConflictData)))
+
+typedef struct RWConflictPoolHeaderData
+{
+	SHM_QUEUE	availableList;	/* RWConflictData entries not currently in use */
+	RWConflict	element;		/* presumably the base of the entry pool -- confirm */
+}	RWConflictPoolHeaderData;
+
+typedef struct RWConflictPoolHeaderData *RWConflictPoolHeader;
+
+#define RWConflictPoolHeaderDataSize \
+		((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData)))
+
+
+/*
+ * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable
+ * transaction or any of its subtransactions.
+ */
+typedef struct SERIALIZABLEXIDTAG
+{
+	TransactionId xid;
+} SERIALIZABLEXIDTAG;
+
+/*
+ * The SERIALIZABLEXID struct provides a link from a TransactionId for a
+ * serializable transaction to the related SERIALIZABLEXACT record, even if
+ * the transaction has completed and its connection has been closed.
+ *
+ * These are created as new top level transaction IDs are first assigned to
+ * transactions which are participating in predicate locking.  This may
+ * never happen for a particular transaction if it doesn't write anything.
+ * They are removed with their related serializable transaction objects.
+ *
+ * The SubTransGetTopmostTransaction method is used where necessary to get
+ * from an XID which might be from a subtransaction to the top level XID.
+ */
+typedef struct SERIALIZABLEXID
+{
+	/* hash key */
+	SERIALIZABLEXIDTAG tag;
+
+	/* data */
+	SERIALIZABLEXACT *myXact;	/* pointer to the top level transaction data */
+} SERIALIZABLEXID;
+
+
+/*
+ * The PREDICATELOCKTARGETTAG struct identifies a database object which can
+ * be the target of predicate locks.
+ *
+ * Note that the hash function being used doesn't properly respect tag
+ * length -- it will go to a four byte boundary past the end of the tag.
+ * If you change this struct, make sure any slack space is initialized,
+ * so that any random bytes in the middle or at the end are not included
+ * in the hash.
+ *
+ * TODO SSI: If we always use the same fields for the same type of value, we
+ * should rename these.  Holding off until it's clear there are no exceptions.
+ * Since indexes are relations with blocks and tuples, it's looking likely that
+ * the rename will be possible.  If not, we may need to divide the last field
+ * and use part of it for a target type, so that we know how to interpret the
+ * data.
+ */
+typedef struct PREDICATELOCKTARGETTAG
+{
+	uint32		locktag_field1; /* a 32-bit ID field */
+	uint32		locktag_field2; /* a 32-bit ID field */
+	uint32		locktag_field3; /* a 32-bit ID field */
+	uint32		locktag_field4; /* a 32-bit ID field */
+	uint32		locktag_field5; /* a 32-bit ID field */
+} PREDICATELOCKTARGETTAG;
+
+/*
+ * The PREDICATELOCKTARGET struct represents a database object on which there
+ * are predicate locks.
+ *
+ * A hash list of these objects is maintained in shared memory.  An entry is
+ * added when a predicate lock is requested on an object which doesn't
+ * already have one.  An entry is removed when the last lock is removed from
+ * its list.
+ *
+ * Because a particular target might become obsolete, due to update to a new
+ * version, before the reading transaction is obsolete, we need some way to
+ * prevent errors from reuse of a tuple ID.  Rather than attempting to clean
+ * up the targets as the related tuples are pruned or vacuumed, we check the
+ * xmin on access.	This should be far less costly.
+ */
+typedef struct PREDICATELOCKTARGET
+{
+	/* hash key */
+	PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
+
+	/* data */
+	SHM_QUEUE	predicateLocks; /* list of PREDICATELOCK objects assoc. with
+								 * predicate lock target */
+} PREDICATELOCKTARGET;
+
+
+/*
+ * The PREDICATELOCKTAG struct identifies an individual predicate lock.
+ *
+ * It is the combination of predicate lock target (which is a lockable
+ * object) and a serializable transaction which has acquired a lock on that
+ * target.
+ */
+typedef struct PREDICATELOCKTAG
+{
+	PREDICATELOCKTARGET *myTarget;
+	SERIALIZABLEXACT *myXact;
+} PREDICATELOCKTAG;
+
+/*
+ * The PREDICATELOCK struct represents an individual lock.
+ *
+ * An entry can be created here when the related database object is read, or
+ * by promotion of multiple finer-grained targets.	All entries related to a
+ * serializable transaction are removed when that serializable transaction is
+ * cleaned up.	Entries can also be removed when they are combined into a
+ * single coarser-grained lock entry.
+ */
+typedef struct PREDICATELOCK
+{
+	/* hash key */
+	PREDICATELOCKTAG tag;		/* unique identifier of lock */
+
+	/* data */
+	SHM_QUEUE	targetLink;		/* list link in PREDICATELOCKTARGET's list of
+								 * predicate locks */
+	SHM_QUEUE	xactLink;		/* list link in SERIALIZABLEXACT's list of
+								 * predicate locks */
+	SerCommitSeqNo commitSeqNo; /* only used for summarized predicate locks */
+} PREDICATELOCK;
+
+
+/*
+ * The LOCALPREDICATELOCK struct represents a local copy of data which is
+ * also present in the PREDICATELOCK table, organized for fast access without
+ * needing to acquire a LWLock.  It is strictly for optimization.
+ *
+ * Each serializable transaction creates its own local hash table to hold a
+ * collection of these.  This information is used to determine when a number
+ * of fine-grained locks should be promoted to a single coarser-grained lock.
+ * The information is maintained more-or-less in parallel to the
+ * PREDICATELOCK data, but because this data is not protected by locks and is
+ * only used in an optimization heuristic, it is allowed to drift in a few
+ * corner cases where maintaining exact data would be expensive.
+ *
+ * The hash table is created when the serializable transaction acquires its
+ * snapshot, and its memory is released upon completion of the transaction.
+ */
+typedef struct LOCALPREDICATELOCK
+{
+	/* hash key */
+	PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
+
+	/* data */
+	bool		held;			/* is lock held, or just its children?	*/
+	int			childLocks;		/* number of child locks currently held */
+} LOCALPREDICATELOCK;
+
+
+/*
+ * The types of predicate locks which can be acquired.
+ */
+typedef enum PredicateLockTargetType
+{
+	PREDLOCKTAG_RELATION,
+	PREDLOCKTAG_PAGE,
+	PREDLOCKTAG_TUPLE
+	/* TODO SSI: Other types may be needed for index locking */
+} PredicateLockTargetType;
+
+
+/*
+ * This structure is used to quickly capture a copy of all predicate
+ * locks.  This is currently used only by the pg_lock_status function,
+ * which in turn is used by the pg_locks view.
+ */
+typedef struct PredicateLockData
+{
+	int			nelements;		/* NOTE(review): presumably the entry count of both arrays below -- confirm */
+	PREDICATELOCKTARGETTAG *locktags;	/* target tags of the captured predicate locks */
+	SERIALIZABLEXACT *xacts;	/* NOTE(review): presumably parallel to locktags, one sxact per lock -- confirm */
+} PredicateLockData;
+
+
+/*
+ * These macros define how we map logical IDs of lockable objects into the
+ * physical fields of PREDICATELOCKTARGETTAG.	Use these to set up values,
+ * rather than accessing the fields directly.  Note multiple eval of target!
+ */
+#define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \
+	((locktag).locktag_field1 = (dboid), \
+	 (locktag).locktag_field2 = (reloid), \
+	 (locktag).locktag_field3 = InvalidBlockNumber, \
+	 (locktag).locktag_field4 = InvalidOffsetNumber, \
+	 (locktag).locktag_field5 = InvalidTransactionId)
+
+#define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \
+	((locktag).locktag_field1 = (dboid), \
+	 (locktag).locktag_field2 = (reloid), \
+	 (locktag).locktag_field3 = (blocknum), \
+	 (locktag).locktag_field4 = InvalidOffsetNumber, \
+	 (locktag).locktag_field5 = InvalidTransactionId)
+
+#define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum,xmin) \
+	((locktag).locktag_field1 = (dboid), \
+	 (locktag).locktag_field2 = (reloid), \
+	 (locktag).locktag_field3 = (blocknum), \
+	 (locktag).locktag_field4 = (offnum), \
+	 (locktag).locktag_field5 = (xmin))
+
+#define GET_PREDICATELOCKTARGETTAG_DB(locktag) \
+	((Oid) (locktag).locktag_field1)
+#define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \
+	((Oid) (locktag).locktag_field2)
+#define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \
+	((BlockNumber) (locktag).locktag_field3)
+#define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \
+	((OffsetNumber) (locktag).locktag_field4)
+#define GET_PREDICATELOCKTARGETTAG_XMIN(locktag) \
+	((TransactionId) (locktag).locktag_field5)
+#define GET_PREDICATELOCKTARGETTAG_TYPE(locktag)							 \
+	(((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \
+	 (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE :   \
+	  PREDLOCKTAG_RELATION))
+
+/*
+ * Two-phase commit statefile records. There are two types: for each
+ * transaction, we generate one per-transaction record and a variable
+ * number of per-predicate-lock records.
+ */
+typedef enum TwoPhasePredicateRecordType
+{
+	TWOPHASEPREDICATERECORD_XACT,
+	TWOPHASEPREDICATERECORD_LOCK
+} TwoPhasePredicateRecordType;
+
+/*
+ * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not
+ * much is needed because most of it is not meaningful for a recovered
+ * prepared transaction.
+ *
+ * In particular, we do not record the in and out conflict lists for a
+ * prepared transaction because the associated SERIALIZABLEXACTs will
+ * not be available after recovery. Instead, we simply record the
+ * existence of each type of conflict by setting the transaction's
+ * summary conflict in/out flag.
+ */
+typedef struct TwoPhasePredicateXactRecord
+{
+	TransactionId xmin;
+	uint32		flags;
+} TwoPhasePredicateXactRecord;
+
+/* Per-lock state */
+typedef struct TwoPhasePredicateLockRecord
+{
+	PREDICATELOCKTARGETTAG target;
+} TwoPhasePredicateLockRecord;
+
+typedef struct TwoPhasePredicateRecord
+{
+	TwoPhasePredicateRecordType type;	/* NOTE(review): presumably selects the valid member of data -- confirm */
+	union
+	{
+		TwoPhasePredicateXactRecord xactRecord;	/* per-transaction record */
+		TwoPhasePredicateLockRecord lockRecord;	/* per-predicate-lock record */
+	}			data;
+} TwoPhasePredicateRecord;
+
+/*
+ * Define a macro to use for an "empty" SERIALIZABLEXACT reference.
+ */
+#define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL)
+
+
+/*
+ * Function definitions for functions needing awareness of predicate
+ * locking internals.
+ */
+extern PredicateLockData *GetPredicateLockStatusData(void);
+
+
+#endif   /* PREDICATE_INTERNALS_H */
--- a/pg_include/storage/proc.h
+++ b/pg_include/storage/proc.h
@@ -0,0 +1,264 @@
+/*-------------------------------------------------------------------------
+ *
+ * proc.h
+ *	  per-process shared memory data structures
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/proc.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _PROC_H_
+#define _PROC_H_
+
+#include "access/xlog.h"
+#include "datatype/timestamp.h"
+#include "storage/latch.h"
+#include "storage/lock.h"
+#include "storage/pg_sema.h"
+
+/*
+ * Each backend advertises up to PGPROC_MAX_CACHED_SUBXIDS TransactionIds
+ * for non-aborted subtransactions of its current top transaction.	These
+ * have to be treated as running XIDs by other backends.
+ *
+ * We also keep track of whether the cache overflowed (ie, the transaction has
+ * generated at least one subtransaction that didn't fit in the cache).
+ * If none of the caches have overflowed, we can assume that an XID that's not
+ * listed anywhere in the PGPROC array is not a running transaction.  Else we
+ * have to look at pg_subtrans.
+ */
+#define PGPROC_MAX_CACHED_SUBXIDS 64	/* XXX guessed-at value */
+
+struct XidCache
+{
+	TransactionId xids[PGPROC_MAX_CACHED_SUBXIDS];
+};
+
+/* Flags for PGXACT->vacuumFlags */
+#define		PROC_IS_AUTOVACUUM	0x01	/* is it an autovac worker? */
+#define		PROC_IN_VACUUM		0x02	/* currently running lazy vacuum */
+#define		PROC_IN_ANALYZE		0x04	/* currently running analyze */
+#define		PROC_VACUUM_FOR_WRAPAROUND 0x08		/* set by autovac only */
+
+/* flags reset at EOXact */
+#define		PROC_VACUUM_STATE_MASK (0x0E)
+
+/*
+ * We allow a small number of "weak" relation locks (AccessShareLock,
+ * RowShareLock, RowExclusiveLock) to be recorded in the PGPROC structure
+ * rather than the main lock table.  This eases contention on the lock
+ * manager LWLocks.  See storage/lmgr/README for additional details.
+ */
+#define		FP_LOCK_SLOTS_PER_BACKEND 16
+
+/*
+ * Each backend has a PGPROC struct in shared memory.  There is also a list of
+ * currently-unused PGPROC structs that will be reallocated to new backends.
+ *
+ * links: list link for any list the PGPROC is in.	When waiting for a lock,
+ * the PGPROC is linked into that lock's waitProcs queue.  A recycled PGPROC
+ * is linked into ProcGlobal's freeProcs list.
+ *
+ * Note: twophase.c also sets up a dummy PGPROC struct for each currently
+ * prepared transaction.  These PGPROCs appear in the ProcArray data structure
+ * so that the prepared transactions appear to be still running and are
+ * correctly shown as holding locks.  A prepared transaction PGPROC can be
+ * distinguished from a real one at need by the fact that it has pid == 0.
+ * The semaphore and lock-activity fields in a prepared-xact PGPROC are unused,
+ * but its myProcLocks[] lists are valid.
+ */
+struct PGPROC
+{
+	/* proc->links MUST BE FIRST IN STRUCT (see ProcSleep,ProcWakeup,etc) */
+	SHM_QUEUE	links;			/* list link if process is in a list */
+
+	PGSemaphoreData sem;		/* ONE semaphore to sleep on */
+	int			waitStatus;		/* STATUS_WAITING, STATUS_OK or STATUS_ERROR */
+
+	Latch		procLatch;		/* generic latch for process */
+
+	LocalTransactionId lxid;	/* local id of top-level transaction currently
+								 * being executed by this proc, if running;
+								 * else InvalidLocalTransactionId */
+	int			pid;			/* Backend's process ID; 0 if prepared xact */
+	int			pgprocno;		/* NOTE(review): presumably this PGPROC's index in ProcGlobal->allProcs -- confirm */
+
+	/* These fields are zero while a backend is still starting up: */
+	BackendId	backendId;		/* This backend's backend ID (if assigned) */
+	Oid			databaseId;		/* OID of database this backend is using */
+	Oid			roleId;			/* OID of role using this backend */
+
+	/*
+	 * While in hot standby mode, shows that a conflict signal has been sent
+	 * for the current transaction. Set/cleared while holding ProcArrayLock,
+	 * though not required. Accessed without lock, if needed.
+	 */
+	bool		recoveryConflictPending;
+
+	/* Info about LWLock the process is currently waiting for, if any. */
+	bool		lwWaiting;		/* true if waiting for an LW lock */
+	uint8		lwWaitMode;		/* lwlock mode being waited for */
+	struct PGPROC *lwWaitLink;	/* next waiter for same LW lock */
+
+	/* Info about lock the process is currently waiting for, if any. */
+	/* waitLock and waitProcLock are NULL if not currently waiting. */
+	LOCK	   *waitLock;		/* Lock object we're sleeping on ... */
+	PROCLOCK   *waitProcLock;	/* Per-holder info for awaited lock */
+	LOCKMODE	waitLockMode;	/* type of lock we're waiting for */
+	LOCKMASK	heldLocks;		/* bitmask for lock types already held on this
+								 * lock object by this backend */
+
+	/*
+	 * Info to allow us to wait for synchronous replication, if needed.
+	 * waitLSN is InvalidXLogRecPtr if not waiting; set only by user backend.
+	 * syncRepState must not be touched except by owning process or WALSender.
+	 * syncRepLinks used only while holding SyncRepLock.
+	 */
+	XLogRecPtr	waitLSN;		/* waiting for this LSN or higher */
+	int			syncRepState;	/* wait state for sync rep */
+	SHM_QUEUE	syncRepLinks;	/* list link if process is in syncrep queue */
+
+	/*
+	 * All PROCLOCK objects for locks held or awaited by this backend are
+	 * linked into one of these lists, according to the partition number of
+	 * their lock.
+	 */
+	SHM_QUEUE	myProcLocks[NUM_LOCK_PARTITIONS];
+
+	struct XidCache subxids;	/* cache for subtransaction XIDs */
+
+	/* Per-backend LWLock guarding the fast-path lock fields that follow. */
+	LWLockId	backendLock;	/* protects the fields below */
+
+	/* Lock manager data, recording fast-path locks taken by this backend. */
+	uint64		fpLockBits;		/* lock modes held for each fast-path slot */
+	Oid			fpRelId[FP_LOCK_SLOTS_PER_BACKEND];		/* slots for rel oids */
+	bool		fpVXIDLock;		/* are we holding a fast-path VXID lock? */
+	LocalTransactionId fpLocalTransactionId;	/* lxid for fast-path VXID
+												 * lock */
+};
+
+/* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */
+
+
+extern PGDLLIMPORT PGPROC *MyProc;
+extern PGDLLIMPORT struct PGXACT *MyPgXact;
+
+/*
+ * Prior to PostgreSQL 9.2, the fields below were stored as part of the
+ * PGPROC.	However, benchmarking revealed that packing these particular
+ * members into a separate array as tightly as possible sped up GetSnapshotData
+ * considerably on systems with many CPU cores, by reducing the number of
+ * cache lines needing to be fetched.  Thus, think very carefully before adding
+ * anything else here.
+ */
+typedef struct PGXACT
+{
+	TransactionId xid;			/* id of top-level transaction currently being
+								 * executed by this proc, if running and XID
+								 * is assigned; else InvalidTransactionId */
+
+	TransactionId xmin;			/* minimal running XID as it was when we were
+								 * starting our xact, excluding LAZY VACUUM:
+								 * vacuum must not remove tuples deleted by
+								 * xid >= xmin ! */
+
+	uint8		vacuumFlags;	/* vacuum-related flags, see above */
+	bool		overflowed;		/* NOTE(review): presumably "subxid cache overflowed" (see XidCache comment) -- confirm */
+	bool		inCommit;		/* true if within commit critical section */
+
+	uint8		nxids;			/* NOTE(review): presumably # of valid entries in PGPROC.subxids -- confirm */
+} PGXACT;
+
+/*
+ * There is one ProcGlobal struct for the whole database cluster.
+ */
+typedef struct PROC_HDR
+{
+	/* Array of PGPROC structures (not including dummies for prepared txns) */
+	PGPROC	   *allProcs;
+	/* Array of PGXACT structures (not including dummies for prepared txns) */
+	PGXACT	   *allPgXact;
+	/* Length of allProcs array */
+	uint32		allProcCount;
+	/* Head of list of free PGPROC structures */
+	PGPROC	   *freeProcs;
+	/* Head of list of autovacuum's free PGPROC structures */
+	PGPROC	   *autovacFreeProcs;
+	/* WALWriter process's latch */
+	Latch	   *walwriterLatch;
+	/* Checkpointer process's latch */
+	Latch	   *checkpointerLatch;
+	/* Current shared estimate of appropriate spins_per_delay value */
+	int			spins_per_delay;
+	/* The proc of the Startup process, since not in ProcArray */
+	PGPROC	   *startupProc;
+	int			startupProcPid;	/* NOTE(review): presumably the Startup process's PID -- confirm */
+	/* Buffer id of the buffer that Startup process waits for pin on, or -1 */
+	int			startupBufferPinWaitBufId;
+} PROC_HDR;
+
+extern PROC_HDR *ProcGlobal;
+
+extern PGPROC *PreparedXactProcs;
+
+/*
+ * We set aside some extra PGPROC structures for auxiliary processes,
+ * ie things that aren't full-fledged backends but need shmem access.
+ *
+ * Background writer, checkpointer and WAL writer run during normal operation.
+ * Startup process and WAL receiver also consume 2 slots, but WAL writer is
+ * launched only after startup has exited, so we only need 4 slots.
+ */
+#define NUM_AUXILIARY_PROCS		4
+
+
+/* configurable options */
+extern int	DeadlockTimeout;
+extern int	StatementTimeout;
+extern bool log_lock_waits;
+
+extern volatile bool cancel_from_timeout;
+
+
+/*
+ * Function Prototypes
+ */
+extern int	ProcGlobalSemas(void);
+extern Size ProcGlobalShmemSize(void);
+extern void InitProcGlobal(void);
+extern void InitProcess(void);
+extern void InitProcessPhase2(void);
+extern void InitAuxiliaryProcess(void);
+
+extern void PublishStartupProcessInformation(void);
+extern void SetStartupBufferPinWaitBufId(int bufid);
+extern int	GetStartupBufferPinWaitBufId(void);
+
+extern bool HaveNFreeProcs(int n);
+extern void ProcReleaseLocks(bool isCommit);
+
+extern void ProcQueueInit(PROC_QUEUE *queue);
+extern int	ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable);
+extern PGPROC *ProcWakeup(PGPROC *proc, int waitStatus);
+extern void ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock);
+extern bool IsWaitingForLock(void);
+extern void LockErrorCleanup(void);
+
+extern void ProcWaitForSignal(void);
+extern void ProcSendSignal(int pid);
+
+extern bool enable_sig_alarm(int delayms, bool is_statement_timeout);
+extern bool disable_sig_alarm(bool is_statement_timeout);
+extern void handle_sig_alarm(SIGNAL_ARGS);
+
+extern bool enable_standby_sig_alarm(TimestampTz now,
+						 TimestampTz fin_time, bool deadlock_only);
+extern bool disable_standby_sig_alarm(void);
+extern void handle_standby_sig_alarm(SIGNAL_ARGS);
+
+#endif   /* _PROC_H_ */
--- a/pg_include/storage/procarray.h
+++ b/pg_include/storage/procarray.h
@@ -0,0 +1,79 @@
+/*-------------------------------------------------------------------------
+ *
+ * procarray.h
+ *	  POSTGRES process array definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/procarray.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PROCARRAY_H
+#define PROCARRAY_H
+
+#include "storage/standby.h"
+#include "utils/snapshot.h"
+
+
+extern Size ProcArrayShmemSize(void);
+extern void CreateSharedProcArray(void);
+extern void ProcArrayAdd(PGPROC *proc);
+extern void ProcArrayRemove(PGPROC *proc, TransactionId latestXid);
+
+extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid);
+extern void ProcArrayClearTransaction(PGPROC *proc);
+
+extern void ProcArrayApplyRecoveryInfo(RunningTransactions running);
+extern void ProcArrayApplyXidAssignment(TransactionId topxid,
+							int nsubxids, TransactionId *subxids);
+
+extern void RecordKnownAssignedTransactionIds(TransactionId xid);
+extern void ExpireTreeKnownAssignedTransactionIds(TransactionId xid,
+									  int nsubxids, TransactionId *subxids,
+									  TransactionId max_xid);
+extern void ExpireAllKnownAssignedTransactionIds(void);
+extern void ExpireOldKnownAssignedTransactionIds(TransactionId xid);
+
+extern int	GetMaxSnapshotXidCount(void);
+extern int	GetMaxSnapshotSubxidCount(void);
+
+extern Snapshot GetSnapshotData(Snapshot snapshot);
+
+extern bool ProcArrayInstallImportedXmin(TransactionId xmin,
+							 TransactionId sourcexid);
+
+extern RunningTransactions GetRunningTransactionData(void);
+
+extern bool TransactionIdIsInProgress(TransactionId xid);
+extern bool TransactionIdIsActive(TransactionId xid);
+extern TransactionId GetOldestXmin(bool allDbs, bool ignoreVacuum);
+extern TransactionId GetOldestActiveTransactionId(void);
+
+extern int	GetTransactionsInCommit(TransactionId **xids_p);
+extern bool HaveTransactionsInCommit(TransactionId *xids, int nxids);
+
+extern PGPROC *BackendPidGetProc(int pid);
+extern int	BackendXidGetPid(TransactionId xid);
+extern bool IsBackendPid(int pid);
+
+extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin,
+					  bool excludeXmin0, bool allDbs, int excludeVacuum,
+					  int *nvxids);
+extern VirtualTransactionId *GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid);
+extern pid_t CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode);
+
+extern bool MinimumActiveBackends(int min);
+extern int	CountDBBackends(Oid databaseid);
+extern void CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending);
+extern int	CountUserBackends(Oid roleid);
+extern bool CountOtherDBBackends(Oid databaseId,
+					 int *nbackends, int *nprepared);
+
+extern void XidCacheRemoveRunningXids(TransactionId xid,
+						  int nxids, const TransactionId *xids,
+						  TransactionId latestXid);
+
+#endif   /* PROCARRAY_H */
--- a/pg_include/storage/procsignal.h
+++ b/pg_include/storage/procsignal.h
@@ -0,0 +1,58 @@
+/*-------------------------------------------------------------------------
+ *
+ * procsignal.h
+ *	  Routines for interprocess signalling
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/procsignal.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PROCSIGNAL_H
+#define PROCSIGNAL_H
+
+#include "storage/backendid.h"
+
+
+/*
+ * Reasons for signalling a Postgres child process (a backend or an auxiliary
+ * process, like checkpointer).  We can cope with concurrent signals for different
+ * reasons.  However, if the same reason is signaled multiple times in quick
+ * succession, the process is likely to observe only one notification of it.
+ * This is okay for the present uses.
+ *
+ * Also, because of race conditions, it's important that all the signals be
+ * defined so that no harm is done if a process mistakenly receives one.
+ */
+typedef enum
+{
+	PROCSIG_CATCHUP_INTERRUPT,	/* sinval catchup interrupt */
+	PROCSIG_NOTIFY_INTERRUPT,	/* listen/notify interrupt */
+
+	/* Recovery conflict reasons */
+	PROCSIG_RECOVERY_CONFLICT_DATABASE,
+	PROCSIG_RECOVERY_CONFLICT_TABLESPACE,
+	PROCSIG_RECOVERY_CONFLICT_LOCK,
+	PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
+	PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
+	PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
+
+	NUM_PROCSIGNALS				/* Must be last! */
+} ProcSignalReason;
+
+/*
+ * prototypes for functions in procsignal.c
+ */
+extern Size ProcSignalShmemSize(void);
+extern void ProcSignalShmemInit(void);
+
+extern void ProcSignalInit(int pss_idx);
+extern int SendProcSignal(pid_t pid, ProcSignalReason reason,
+			   BackendId backendId);
+
+extern void procsignal_sigusr1_handler(SIGNAL_ARGS);
+
+#endif   /* PROCSIGNAL_H */
--- a/pg_include/storage/reinit.h
+++ b/pg_include/storage/reinit.h
@@ -0,0 +1,23 @@
+/*-------------------------------------------------------------------------
+ *
+ * reinit.h
+ *	  Reinitialization of unlogged relations
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/reinit.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef REINIT_H
+#define REINIT_H
+
+extern void ResetUnloggedRelations(int op);
+
+#define UNLOGGED_RELATION_CLEANUP		0x0001
+#define UNLOGGED_RELATION_INIT			0x0002
+
+#endif   /* REINIT_H */
--- a/pg_include/storage/relfilenode.h
+++ b/pg_include/storage/relfilenode.h
@@ -0,0 +1,119 @@
+/*-------------------------------------------------------------------------
+ *
+ * relfilenode.h
+ *	  Physical access information for relations.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/relfilenode.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef RELFILENODE_H
+#define RELFILENODE_H
+
+#include "storage/backendid.h"
+
+/*
+ * The physical storage of a relation consists of one or more forks. The
+ * main fork is always created, but in addition to that there can be
+ * additional forks for storing various metadata. ForkNumber is used when
+ * we need to refer to a specific fork in a relation.
+ */
+typedef enum ForkNumber
+{
+	InvalidForkNumber = -1,
+	MAIN_FORKNUM = 0,
+	FSM_FORKNUM,
+	VISIBILITYMAP_FORKNUM,
+	INIT_FORKNUM
+
+	/*
+	 * NOTE: if you add a new fork, change MAX_FORKNUM below and update the
+	 * forkNames array in catalog.c
+	 */
+} ForkNumber;
+
+#define MAX_FORKNUM		INIT_FORKNUM
+
+/*
+ * RelFileNode must provide all that we need to know to physically access
+ * a relation, with the exception of the backend ID, which can be provided
+ * separately. Note, however, that a "physical" relation is comprised of
+ * multiple files on the filesystem, as each fork is stored as a separate
+ * file, and each fork can be divided into multiple segments. See md.c.
+ *
+ * spcNode identifies the tablespace of the relation.  It corresponds to
+ * pg_tablespace.oid.
+ *
+ * dbNode identifies the database of the relation.	It is zero for
+ * "shared" relations (those common to all databases of a cluster).
+ * Nonzero dbNode values correspond to pg_database.oid.
+ *
+ * relNode identifies the specific relation.  relNode corresponds to
+ * pg_class.relfilenode (NOT pg_class.oid, because we need to be able
+ * to assign new physical files to relations in some situations).
+ * Notice that relNode is only unique within a particular database.
+ *
+ * Note: spcNode must be GLOBALTABLESPACE_OID if and only if dbNode is
+ * zero.  We support shared relations only in the "global" tablespace.
+ *
+ * Note: in pg_class we allow reltablespace == 0 to denote that the
+ * relation is stored in its database's "default" tablespace (as
+ * identified by pg_database.dattablespace).  However this shorthand
+ * is NOT allowed in RelFileNode structs --- the real tablespace ID
+ * must be supplied when setting spcNode.
+ *
+ * Note: in pg_class, relfilenode can be zero to denote that the relation
+ * is a "mapped" relation, whose current true filenode number is available
+ * from relmapper.c.  Again, this case is NOT allowed in RelFileNodes.
+ *
+ * Note: various places use RelFileNode in hashtable keys.  Therefore,
+ * there *must not* be any unused padding bytes in this struct.  That
+ * should be safe as long as all the fields are of type Oid.
+ */
+typedef struct RelFileNode
+{
+	Oid			spcNode;		/* tablespace */
+	Oid			dbNode;			/* database */
+	Oid			relNode;		/* relation */
+} RelFileNode;
+
+/*
+ * Augmenting a relfilenode with the backend ID provides all the information
+ * we need to locate the physical storage.  The backend ID is InvalidBackendId
+ * for regular relations (those accessible to more than one backend), or the
+ * owning backend's ID for backend-local relations.  Backend-local relations
+ * are always transient and removed in case of a database crash; they are
+ * never WAL-logged or fsync'd.
+ */
+typedef struct RelFileNodeBackend
+{
+	RelFileNode node;
+	BackendId	backend;
+} RelFileNodeBackend;
+
+#define RelFileNodeBackendIsTemp(rnode) \
+	((rnode).backend != InvalidBackendId)
+
+/*
+ * Note: RelFileNodeEquals and RelFileNodeBackendEquals compare relNode first
+ * since that is most likely to be different in two unequal RelFileNodes.  It
+ * is probably redundant to compare spcNode if the other fields are found equal,
+ * but do it anyway to be sure.  Likewise for checking the backend ID in
+ * RelFileNodeBackendEquals.
+ */
+#define RelFileNodeEquals(node1, node2) \
+	((node1).relNode == (node2).relNode && \
+	 (node1).dbNode == (node2).dbNode && \
+	 (node1).spcNode == (node2).spcNode)
+
+#define RelFileNodeBackendEquals(node1, node2) \
+	((node1).node.relNode == (node2).node.relNode && \
+	 (node1).node.dbNode == (node2).node.dbNode && \
+	 (node1).backend == (node2).backend && \
+	 (node1).node.spcNode == (node2).node.spcNode)
+
+#endif   /* RELFILENODE_H */
--- a/pg_include/storage/s_lock.h
+++ b/pg_include/storage/s_lock.h
--- a/pg_include/storage/shmem.h
+++ b/pg_include/storage/shmem.h
@@ -0,0 +1,78 @@
+/*-------------------------------------------------------------------------
+ *
+ * shmem.h
+ *	  shared memory management structures
+ *
+ * Historical note:
+ * A long time ago, Postgres' shared memory region was allowed to be mapped
+ * at a different address in each process, and shared memory "pointers" were
+ * passed around as offsets relative to the start of the shared memory region.
+ * That is no longer the case: each process must map the shared memory region
+ * at the same address.  This means shared memory pointers can be passed
+ * around directly between different processes.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/shmem.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SHMEM_H
+#define SHMEM_H
+
+#include "utils/hsearch.h"
+
+
+/* shmqueue.c */
+typedef struct SHM_QUEUE
+{
+	struct SHM_QUEUE *prev;
+	struct SHM_QUEUE *next;
+} SHM_QUEUE;
+
+/* shmem.c */
+extern void InitShmemAccess(void *seghdr);
+extern void InitShmemAllocation(void);
+extern void *ShmemAlloc(Size size);
+extern bool ShmemAddrIsValid(const void *addr);
+extern void InitShmemIndex(void);
+extern HTAB *ShmemInitHash(const char *name, long init_size, long max_size,
+			  HASHCTL *infoP, int hash_flags);
+extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr);
+extern Size add_size(Size s1, Size s2);
+extern Size mul_size(Size s1, Size s2);
+
+/* ipci.c */
+extern void RequestAddinShmemSpace(Size size);
+
+/* size constants for the shmem index table */
+ /* max size of data structure string name */
+#define SHMEM_INDEX_KEYSIZE		 (48)
+ /* estimated size of the shmem index table (not a hard limit) */
+#define SHMEM_INDEX_SIZE		 (64)
+
+/* this is a hash bucket in the shmem index table */
+typedef struct
+{
+	char		key[SHMEM_INDEX_KEYSIZE];		/* string name */
+	void	   *location;		/* location in shared mem */
+	Size		size;			/* # bytes allocated for the structure */
+} ShmemIndexEnt;
+
+/*
+ * prototypes for functions in shmqueue.c
+ */
+extern void SHMQueueInit(SHM_QUEUE *queue);
+extern void SHMQueueElemInit(SHM_QUEUE *queue);
+extern void SHMQueueDelete(SHM_QUEUE *queue);
+extern void SHMQueueInsertBefore(SHM_QUEUE *queue, SHM_QUEUE *elem);
+extern void SHMQueueInsertAfter(SHM_QUEUE *queue, SHM_QUEUE *elem);
+extern Pointer SHMQueueNext(const SHM_QUEUE *queue, const SHM_QUEUE *curElem,
+			 Size linkOffset);
+extern Pointer SHMQueuePrev(const SHM_QUEUE *queue, const SHM_QUEUE *curElem,
+			 Size linkOffset);
+extern bool SHMQueueEmpty(const SHM_QUEUE *queue);
+extern bool SHMQueueIsDetached(const SHM_QUEUE *queue);
+
+#endif   /* SHMEM_H */
--- a/pg_include/storage/sinval.h
+++ b/pg_include/storage/sinval.h
@@ -0,0 +1,139 @@
+/*-------------------------------------------------------------------------
+ *
+ * sinval.h
+ *	  POSTGRES shared cache invalidation communication definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/sinval.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SINVAL_H
+#define SINVAL_H
+
+#include "storage/relfilenode.h"
+
+
+/*
+ * We support several types of shared-invalidation messages:
+ *	* invalidate a specific tuple in a specific catcache
+ *	* invalidate all catcache entries from a given system catalog
+ *	* invalidate a relcache entry for a specific logical relation
+ *	* invalidate an smgr cache entry for a specific physical relation
+ *	* invalidate the mapped-relation mapping for a given database
+ * More types could be added if needed.  The message type is identified by
+ * the first "int8" field of the message struct.  Zero or positive means a
+ * specific-catcache inval message (and also serves as the catcache ID field).
+ * Negative values identify the other message types, as per codes below.
+ *
+ * Catcache inval events are initially driven by detecting tuple inserts,
+ * updates and deletions in system catalogs (see CacheInvalidateHeapTuple).
+ * An update can generate two inval events, one for the old tuple and one for
+ * the new, but this is reduced to one event if the tuple's hash key doesn't
+ * change.  Note that the inval events themselves don't actually say whether
+ * the tuple is being inserted or deleted.  Also, since we transmit only a
+ * hash key, there is a small risk of unnecessary invalidations due to chance
+ * matches of hash keys.
+ *
+ * Note that some system catalogs have multiple caches on them (with different
+ * indexes).  On detecting a tuple invalidation in such a catalog, separate
+ * catcache inval messages must be generated for each of its caches, since
+ * the hash keys will generally be different.
+ *
+ * Catcache and relcache invalidations are transactional, and so are sent
+ * to other backends upon commit.  Internally to the generating backend,
+ * they are also processed at CommandCounterIncrement so that later commands
+ * in the same transaction see the new state.  The generating backend also
+ * has to process them at abort, to flush out any cache state it's loaded
+ * from no-longer-valid entries.
+ *
+ * smgr and relation mapping invalidations are non-transactional: they are
+ * sent immediately when the underlying file change is made.
+ */
+
+typedef struct
+{
+	int8		id;				/* catcache ID (zero or positive) --- must be first */
+	Oid			dbId;			/* database ID, or 0 if a shared relation */
+	uint32		hashValue;		/* hash value of key for this catcache */
+} SharedInvalCatcacheMsg;
+
+#define SHAREDINVALCATALOG_ID	(-1)	/* id code for SharedInvalCatalogMsg */
+
+typedef struct
+{
+	int8		id;				/* type field --- must be first */
+	Oid			dbId;			/* database ID, or 0 if a shared catalog */
+	Oid			catId;			/* ID of catalog whose contents are invalid */
+} SharedInvalCatalogMsg;
+
+#define SHAREDINVALRELCACHE_ID	(-2)	/* id code for SharedInvalRelcacheMsg */
+
+typedef struct
+{
+	int8		id;				/* type field --- must be first */
+	Oid			dbId;			/* database ID, or 0 if a shared relation */
+	Oid			relId;			/* relation ID */
+} SharedInvalRelcacheMsg;
+
+#define SHAREDINVALSMGR_ID		(-3)	/* id code for SharedInvalSmgrMsg */
+
+typedef struct
+{
+	/* note: field layout chosen to pack into 16 bytes */
+	int8		id;				/* type field --- must be first */
+	int8		backend_hi;		/* high bits of backend ID, if temprel */
+	uint16		backend_lo;		/* low bits of backend ID, if temprel */
+	RelFileNode rnode;			/* spcNode, dbNode, relNode */
+} SharedInvalSmgrMsg;
+
+#define SHAREDINVALRELMAP_ID	(-4)	/* id code for SharedInvalRelmapMsg */
+
+typedef struct
+{
+	int8		id;				/* type field --- must be first */
+	Oid			dbId;			/* database ID, or 0 for shared catalogs */
+} SharedInvalRelmapMsg;
+
+typedef union
+{
+	int8		id;				/* type field --- must be first */
+	SharedInvalCatcacheMsg cc;	/* used when id is zero or positive */
+	SharedInvalCatalogMsg cat;	/* used when id is SHAREDINVALCATALOG_ID */
+	SharedInvalRelcacheMsg rc;	/* used when id is SHAREDINVALRELCACHE_ID */
+	SharedInvalSmgrMsg sm;		/* used when id is SHAREDINVALSMGR_ID */
+	SharedInvalRelmapMsg rm;	/* used when id is SHAREDINVALRELMAP_ID */
+} SharedInvalidationMessage;
+
+
+/* Counter of messages processed; don't worry about overflow. */
+extern uint64 SharedInvalidMessageCounter;
+
+
+extern void SendSharedInvalidMessages(const SharedInvalidationMessage *msgs,
+						  int n);	/* NOTE(review): n is presumably the length of msgs[] -- confirm */
+extern void ReceiveSharedInvalidMessages(
+					  void (*invalFunction) (SharedInvalidationMessage *msg),
+							 void (*resetFunction) (void));
+
+/* signal handler for catchup events (PROCSIG_CATCHUP_INTERRUPT) */
+extern void HandleCatchupInterrupt(void);
+
+/*
+ * enable/disable processing of catchup events directly from signal handler.
+ * The enable routine first performs processing of any catchup events that
+ * have occurred since the last disable.
+ */
+extern void EnableCatchupInterrupt(void);
+extern bool DisableCatchupInterrupt(void);	/* NOTE(review): meaning of the bool result is not documented in this header */
+
+extern int xactGetCommittedInvalidationMessages(SharedInvalidationMessage **msgs,
+									 bool *RelcacheInitFileInval);
+extern void ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs,
+									 int nmsgs, bool RelcacheInitFileInval,
+									 Oid dbid, Oid tsid);
+
+#endif   /* SINVAL_H */
--- a/pg_include/storage/sinvaladt.h
+++ b/pg_include/storage/sinvaladt.h
@@ -0,0 +1,42 @@
+/*-------------------------------------------------------------------------
+ *
+ * sinvaladt.h
+ *	  POSTGRES shared cache invalidation data manager.
+ *
+ * The shared cache invalidation manager is responsible for transmitting
+ * invalidation messages between backends.  Any message sent by any backend
+ * must be delivered to all already-running backends before it can be
+ * forgotten.  (If we run out of space, we instead deliver a "RESET"
+ * message to backends that have fallen too far behind.)
+ *
+ * The struct type SharedInvalidationMessage, defining the contents of
+ * a single message, is defined in sinval.h.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/sinvaladt.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SINVALADT_H
+#define SINVALADT_H
+
+#include "storage/proc.h"
+#include "storage/sinval.h"
+
+/*
+ * prototypes for functions in sinvaladt.c
+ */
+extern Size SInvalShmemSize(void);
+extern void CreateSharedInvalidationState(void);
+extern void SharedInvalBackendInit(bool sendOnly);
+extern PGPROC *BackendIdGetProc(int backendID);
+
+extern void SIInsertDataEntries(const SharedInvalidationMessage *data, int n);	/* NOTE(review): n presumably counts entries in data[] -- confirm */
+extern int	SIGetDataEntries(SharedInvalidationMessage *data, int datasize);	/* NOTE(review): datasize presumably the capacity of data[] -- confirm */
+extern void SICleanupQueue(bool callerHasWriteLock, int minFree);
+
+extern LocalTransactionId GetNextLocalTransactionId(void);
+
+#endif   /* SINVALADT_H */
--- a/pg_include/storage/smgr.h
+++ b/pg_include/storage/smgr.h
@@ -0,0 +1,143 @@
+/*-------------------------------------------------------------------------
+ *
+ * smgr.h
+ *	  storage manager switch public interface declarations.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/smgr.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SMGR_H
+#define SMGR_H
+
+#include "fmgr.h"
+#include "storage/block.h"
+#include "storage/relfilenode.h"
+
+
+/*
+ * smgr.c maintains a table of SMgrRelation objects, which are essentially
+ * cached file handles.  An SMgrRelation is created (if not already present)
+ * by smgropen(), and destroyed by smgrclose().  Note that neither of these
+ * operations imply I/O, they just create or destroy a hashtable entry.
+ * (But smgrclose() may release associated resources, such as OS-level file
+ * descriptors.)
+ *
+ * An SMgrRelation may have an "owner", which is just a pointer to it from
+ * somewhere else; smgr.c will clear this pointer if the SMgrRelation is
+ * closed.  We use this to avoid dangling pointers from relcache to smgr
+ * without having to make the smgr explicitly aware of relcache.  There
+ * can't be more than one "owner" pointer per SMgrRelation, but that's
+ * all we need.
+ *
+ * SMgrRelations that do not have an "owner" are considered to be transient,
+ * and are deleted at end of transaction.
+ */
+typedef struct SMgrRelationData
+{
+	/* smgr_rnode is the hashtable lookup key, so it must be first! */
+	RelFileNodeBackend smgr_rnode;		/* relation physical identifier */
+
+	/* pointer to owning pointer, or NULL if none */
+	struct SMgrRelationData **smgr_owner;
+
+	/*
+	 * These next three fields are not actually used or manipulated by smgr,
+	 * except that they are reset to InvalidBlockNumber upon a cache flush
+	 * event (in particular, upon truncation of the relation).  Higher levels
+	 * store cached state here so that it will be reset when truncation
+	 * happens.  In all three cases, InvalidBlockNumber means "unknown".
+	 */
+	BlockNumber smgr_targblock; /* current insertion target block */
+	BlockNumber smgr_fsm_nblocks;		/* last known size of fsm fork */
+	BlockNumber smgr_vm_nblocks;	/* last known size of vm fork */
+
+	/* additional public fields may someday exist here */
+
+	/*
+	 * Fields below here are intended to be private to smgr.c and its
+	 * submodules.  Do not touch them from elsewhere.
+	 */
+	int			smgr_which;		/* storage manager selector */
+
+	/* for md.c; MAX_FORKNUM + 1 slots, NULL for forks that are not open */
+	struct _MdfdVec *md_fd[MAX_FORKNUM + 1];
+
+	/* if unowned, list link in list of all unowned SMgrRelations */
+	struct SMgrRelationData *next_unowned_reln;
+} SMgrRelationData;
+
+typedef SMgrRelationData *SMgrRelation;	/* pointer type taken by the smgr*() functions */
+
+#define SmgrIsTemp(smgr) \
+	RelFileNodeBackendIsTemp((smgr)->smgr_rnode)	/* applies the temp test to smgr's smgr_rnode field */
+
+extern void smgrinit(void);
+extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend);	/* creates the SMgrRelation if not already present; implies no I/O */
+extern bool smgrexists(SMgrRelation reln, ForkNumber forknum);
+extern void smgrsetowner(SMgrRelation *owner, SMgrRelation reln);	/* at most one owner pointer per SMgrRelation */
+extern void smgrclose(SMgrRelation reln);	/* destroys the SMgrRelation; smgr.c clears its owner pointer */
+extern void smgrcloseall(void);
+extern void smgrclosenode(RelFileNodeBackend rnode);
+extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
+extern void smgrdounlink(SMgrRelation reln, bool isRedo);
+extern void smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool isRedo);
+extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
+		   BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void smgrprefetch(SMgrRelation reln, ForkNumber forknum,
+			 BlockNumber blocknum);
+extern void smgrread(SMgrRelation reln, ForkNumber forknum,
+		 BlockNumber blocknum, char *buffer);
+extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
+		  BlockNumber blocknum, char *buffer, bool skipFsync);
+extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
+extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum,
+			 BlockNumber nblocks);
+extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
+extern void smgrpreckpt(void);
+extern void smgrsync(void);
+extern void smgrpostckpt(void);
+extern void AtEOXact_SMgr(void);	/* NOTE(review): presumably where unowned (transient) SMgrRelations are deleted -- confirm */
+
+
+/* internals: move me elsewhere -- ay 7/94 */
+
+/* prototypes for functions in md.c */
+extern void mdinit(void);
+extern void mdclose(SMgrRelation reln, ForkNumber forknum);
+extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
+extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
+extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
+extern void mdextend(SMgrRelation reln, ForkNumber forknum,
+		 BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
+		   BlockNumber blocknum);
+extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+	   char *buffer);
+extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
+		BlockNumber blocknum, char *buffer, bool skipFsync);
+extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
+extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
+		   BlockNumber nblocks);
+extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
+extern void mdpreckpt(void);
+extern void mdsync(void);
+extern void mdpostckpt(void);
+
+extern void SetForwardFsyncRequests(void);
+extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum,
+					 BlockNumber segno);	/* NOTE(review): segno is typed BlockNumber; presumably a segment number -- confirm */
+extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum);
+extern void ForgetDatabaseFsyncRequests(Oid dbid);
+
+/* prototypes for functions in smgrtype.c (declared with PG_FUNCTION_ARGS, returning Datum) */
+extern Datum smgrout(PG_FUNCTION_ARGS);
+extern Datum smgrin(PG_FUNCTION_ARGS);
+extern Datum smgreq(PG_FUNCTION_ARGS);
+extern Datum smgrne(PG_FUNCTION_ARGS);
+
+#endif   /* SMGR_H */
--- a/pg_include/storage/spin.h
+++ b/pg_include/storage/spin.h
@@ -0,0 +1,73 @@
+/*-------------------------------------------------------------------------
+ *
+ * spin.h
+ *	   Hardware-independent implementation of spinlocks.
+ *
+ *
+ *	The hardware-independent interface to spinlocks is defined by the
+ *	typedef "slock_t" and these macros:
+ *
+ *	void SpinLockInit(volatile slock_t *lock)
+ *		Initialize a spinlock (to the unlocked state).
+ *
+ *	void SpinLockAcquire(volatile slock_t *lock)
+ *		Acquire a spinlock, waiting if necessary.
+ *		Time out and abort() if unable to acquire the lock in a
+ *		"reasonable" amount of time --- typically ~ 1 minute.
+ *
+ *	void SpinLockRelease(volatile slock_t *lock)
+ *		Unlock a previously acquired lock.
+ *
+ *	bool SpinLockFree(slock_t *lock)
+ *		Tests if the lock is free. Returns TRUE if free, FALSE if locked.
+ *		This does *not* change the state of the lock.
+ *
+ *	Callers must beware that the macro argument may be evaluated multiple
+ *	times!
+ *
+ *	CAUTION: Care must be taken to ensure that loads and stores of
+ *	shared memory values are not rearranged around spinlock acquire
+ *	and release. This is done using the "volatile" qualifier: the C
+ *	standard states that loads and stores of volatile objects cannot
+ *	be rearranged *with respect to other volatile objects*. The
+ *	spinlock is always written through a volatile pointer by the
+ *	spinlock macros, but this is not sufficient by itself: code that
+ *	protects shared data with a spinlock MUST reference that shared
+ *	data through a volatile pointer.
+ *
+ *	Keep in mind the coding rule that spinlocks must not be held for more
+ *	than a few instructions.  In particular, we assume it is not possible
+ *	for a CHECK_FOR_INTERRUPTS() to occur while holding a spinlock, and so
+ *	it is not necessary to do HOLD/RESUME_INTERRUPTS() in these macros.
+ *
+ *	These macros are implemented in terms of hardware-dependent macros
+ *	supplied by s_lock.h.  There is not currently any extra functionality
+ *	added by this header, but there has been in the past and may someday
+ *	be again.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/spin.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SPIN_H
+#define SPIN_H
+
+#include "storage/s_lock.h"
+
+
+#define SpinLockInit(lock)	S_INIT_LOCK(lock)	/* initialize to the unlocked state */
+
+#define SpinLockAcquire(lock) S_LOCK(lock)	/* acquire, waiting if necessary */
+
+#define SpinLockRelease(lock) S_UNLOCK(lock)	/* unlock a previously acquired lock */
+
+#define SpinLockFree(lock)	S_LOCK_FREE(lock)	/* test whether free; does not change the lock */
+
+
+extern int	SpinlockSemas(void);
+
+#endif   /* SPIN_H */
--- a/pg_include/storage/standby.h
+++ b/pg_include/storage/standby.h
@@ -0,0 +1,115 @@
+/*-------------------------------------------------------------------------
+ *
+ * standby.h
+ *	  Definitions for hot standby mode.
+ *
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/standby.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef STANDBY_H
+#define STANDBY_H
+
+#include "access/xlog.h"
+#include "storage/lock.h"
+#include "storage/procsignal.h"
+#include "storage/relfilenode.h"
+
+/* User-settable GUC parameters (declared here as extern) */
+extern int	vacuum_defer_cleanup_age;
+extern int	max_standby_archive_delay;
+extern int	max_standby_streaming_delay;
+
+extern void InitRecoveryTransactionEnvironment(void);
+extern void ShutdownRecoveryTransactionEnvironment(void);
+
+extern void ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid,
+									RelFileNode node);
+extern void ResolveRecoveryConflictWithTablespace(Oid tsid);
+extern void ResolveRecoveryConflictWithDatabase(Oid dbid);
+
+extern void ResolveRecoveryConflictWithBufferPin(void);
+extern void SendRecoveryConflictWithBufferPin(ProcSignalReason reason);
+extern void CheckRecoveryConflictDeadlock(void);
+
+/*
+ * Standby Rmgr (RM_STANDBY_ID)
+ *
+ * Standby recovery manager exists to perform actions that are required
+ * to make hot standby work. That includes logging AccessExclusiveLocks taken
+ * by transactions and running-xacts snapshots.
+ */
+extern void StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid);
+extern void StandbyReleaseLockTree(TransactionId xid,
+					   int nsubxids, TransactionId *subxids);	/* NOTE(review): subxids presumably has nsubxids entries -- confirm */
+extern void StandbyReleaseAllLocks(void);
+extern void StandbyReleaseOldLocks(int nxids, TransactionId *xids);	/* NOTE(review): xids presumably has nxids entries -- confirm */
+
+/*
+ * XLOG message types
+ */
+#define XLOG_STANDBY_LOCK			0x00	/* NOTE(review): presumably carries xl_standby_locks -- confirm */
+#define XLOG_RUNNING_XACTS			0x10	/* NOTE(review): presumably carries xl_running_xacts -- confirm */
+
+typedef struct xl_standby_locks
+{
+	int			nlocks;			/* number of entries in locks array */
+	xl_standby_lock locks[1];	/* VARIABLE LENGTH ARRAY: nlocks entries, declared as [1] */
+} xl_standby_locks;
+
+/*
+ * When we write running xact data to WAL, we use this structure.
+ */
+typedef struct xl_running_xacts
+{
+	int			xcnt;			/* # of xact ids in xids[] */
+	bool		subxid_overflow;	/* snapshot overflowed, subxids missing */
+	TransactionId nextXid;		/* copy of ShmemVariableCache->nextXid */
+	TransactionId oldestRunningXid;		/* *not* oldestXmin */
+	TransactionId latestCompletedXid;	/* so we can set xmax */
+
+	TransactionId xids[1];		/* VARIABLE LENGTH ARRAY: xcnt entries, declared as [1] */
+} xl_running_xacts;
+
+#define MinSizeOfXactRunningXacts offsetof(xl_running_xacts, xids)	/* byte offset of xids[], i.e. size of the fixed fields before it */
+
+
+/* Recovery handlers for the Standby Rmgr (RM_STANDBY_ID) */
+extern void standby_redo(XLogRecPtr lsn, XLogRecord *record);
+extern void standby_desc(StringInfo buf, uint8 xl_info, char *rec);	/* NOTE(review): presumably appends a description of rec to buf -- confirm */
+
+/*
+ * Declarations for GetRunningTransactionData(). Similar to Snapshots, but
+ * not quite. This has nothing at all to do with visibility on this server,
+ * so this is completely separate from snapmgr.c and snapmgr.h.
+ * This data is important for creating the initial snapshot state on a
+ * standby server. We need lots more information than a normal snapshot,
+ * hence we use a specific data structure for our needs. This data
+ * is written to WAL as a separate record immediately after each
+ * checkpoint. That means that wherever we start a standby from we will
+ * almost immediately see the data we need to begin executing queries.
+ */
+
+typedef struct RunningTransactionsData
+{
+	int			xcnt;			/* # of xact ids in xids[] */
+	bool		subxid_overflow;	/* snapshot overflowed, subxids missing */
+	TransactionId nextXid;		/* copy of ShmemVariableCache->nextXid */
+	TransactionId oldestRunningXid;		/* *not* oldestXmin */
+	TransactionId latestCompletedXid;	/* so we can set xmax */
+
+	TransactionId *xids;		/* array of (sub)xids still running; a pointer, unlike the inline xids[] of xl_running_xacts */
+} RunningTransactionsData;
+
+typedef RunningTransactionsData *RunningTransactions;	/* pointer typedef for the struct above */
+
+extern void LogAccessExclusiveLock(Oid dbOid, Oid relOid);
+extern void LogAccessExclusiveLockPrepare(void);
+
+extern void LogStandbySnapshot(void);	/* NOTE(review): presumably writes the running-xacts data to WAL -- confirm */
+
+#endif   /* STANDBY_H */