diff options
Diffstat (limited to 'ext/pdo_sqlite/sqlite/src/btree.c')
| -rw-r--r-- | ext/pdo_sqlite/sqlite/src/btree.c | 3966 |
1 files changed, 2115 insertions, 1851 deletions
diff --git a/ext/pdo_sqlite/sqlite/src/btree.c b/ext/pdo_sqlite/sqlite/src/btree.c index 52bc749a32..c83b3ec8d8 100644 --- a/ext/pdo_sqlite/sqlite/src/btree.c +++ b/ext/pdo_sqlite/sqlite/src/btree.c @@ -12,490 +12,64 @@ ** $Id$ ** ** This file implements a external (disk-based) database using BTrees. -** For a detailed discussion of BTrees, refer to -** -** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3: -** "Sorting And Searching", pages 473-480. Addison-Wesley -** Publishing Company, Reading, Massachusetts. -** -** The basic idea is that each page of the file contains N database -** entries and N+1 pointers to subpages. -** -** ---------------------------------------------------------------- -** | Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) | -** ---------------------------------------------------------------- -** -** All of the keys on the page that Ptr(0) points to have values less -** than Key(0). All of the keys on page Ptr(1) and its subpages have -** values greater than Key(0) and less than Key(1). All of the keys -** on Ptr(N+1) and its subpages have values greater than Key(N). And -** so forth. -** -** Finding a particular key requires reading O(log(M)) pages from the -** disk where M is the number of entries in the tree. -** -** In this implementation, a single file can hold one or more separate -** BTrees. Each BTree is identified by the index of its root page. The -** key and data for any entry are combined to form the "payload". A -** fixed amount of payload can be carried directly on the database -** page. If the payload is larger than the preset amount then surplus -** bytes are stored on overflow pages. The payload for an entry -** and the preceding pointer are combined to form a "Cell". Each -** page has a small header which contains the Ptr(N+1) pointer and other -** information such as the size of key and data. -** -** FORMAT DETAILS -** -** The file is divided into pages. The first page is called page 1, -** the second is page 2, and so forth. A page number of zero indicates -** "no such page". The page size can be anything between 512 and 65536. -** Each page can be either a btree page, a freelist page or an overflow -** page. -** -** The first page is always a btree page. The first 100 bytes of the first -** page contain a special header (the "file header") that describes the file. -** The format of the file header is as follows: -** -** OFFSET SIZE DESCRIPTION -** 0 16 Header string: "SQLite format 3\000" -** 16 2 Page size in bytes. -** 18 1 File format write version -** 19 1 File format read version -** 20 1 Bytes of unused space at the end of each page -** 21 1 Max embedded payload fraction -** 22 1 Min embedded payload fraction -** 23 1 Min leaf payload fraction -** 24 4 File change counter -** 28 4 Reserved for future use -** 32 4 First freelist page -** 36 4 Number of freelist pages in the file -** 40 60 15 4-byte meta values passed to higher layers -** -** All of the integer values are big-endian (most significant byte first). -** -** The file change counter is incremented when the database is changed more -** than once within the same second. This counter, together with the -** modification time of the file, allows other processes to know -** when the file has changed and thus when they need to flush their -** cache. -** -** The max embedded payload fraction is the amount of the total usable -** space in a page that can be consumed by a single cell for standard -** B-tree (non-LEAFDATA) tables. A value of 255 means 100%. The default -** is to limit the maximum cell size so that at least 4 cells will fit -** on one page. Thus the default max embedded payload fraction is 64. -** -** If the payload for a cell is larger than the max payload, then extra -** payload is spilled to overflow pages. Once an overflow page is allocated, -** as many bytes as possible are moved into the overflow pages without letting -** the cell size drop below the min embedded payload fraction. -** -** The min leaf payload fraction is like the min embedded payload fraction -** except that it applies to leaf nodes in a LEAFDATA tree. The maximum -** payload fraction for a LEAFDATA tree is always 100% (or 255) and it -** not specified in the header. -** -** Each btree pages is divided into three sections: The header, the -** cell pointer array, and the cell area area. Page 1 also has a 100-byte -** file header that occurs before the page header. -** -** |----------------| -** | file header | 100 bytes. Page 1 only. -** |----------------| -** | page header | 8 bytes for leaves. 12 bytes for interior nodes -** |----------------| -** | cell pointer | | 2 bytes per cell. Sorted order. -** | array | | Grows downward -** | | v -** |----------------| -** | unallocated | -** | space | -** |----------------| ^ Grows upwards -** | cell content | | Arbitrary order interspersed with freeblocks. -** | area | | and free space fragments. -** |----------------| -** -** The page headers looks like this: -** -** OFFSET SIZE DESCRIPTION -** 0 1 Flags. 1: intkey, 2: zerodata, 4: leafdata, 8: leaf -** 1 2 byte offset to the first freeblock -** 3 2 number of cells on this page -** 5 2 first byte of the cell content area -** 7 1 number of fragmented free bytes -** 8 4 Right child (the Ptr(N+1) value). Omitted on leaves. -** -** The flags define the format of this btree page. The leaf flag means that -** this page has no children. The zerodata flag means that this page carries -** only keys and no data. The intkey flag means that the key is a integer -** which is stored in the key size entry of the cell header rather than in -** the payload area. -** -** The cell pointer array begins on the first byte after the page header. -** The cell pointer array contains zero or more 2-byte numbers which are -** offsets from the beginning of the page to the cell content in the cell -** content area. The cell pointers occur in sorted order. The system strives -** to keep free space after the last cell pointer so that new cells can -** be easily added without having to defragment the page. -** -** Cell content is stored at the very end of the page and grows toward the -** beginning of the page. -** -** Unused space within the cell content area is collected into a linked list of -** freeblocks. Each freeblock is at least 4 bytes in size. The byte offset -** to the first freeblock is given in the header. Freeblocks occur in -** increasing order. Because a freeblock must be at least 4 bytes in size, -** any group of 3 or fewer unused bytes in the cell content area cannot -** exist on the freeblock chain. A group of 3 or fewer free bytes is called -** a fragment. The total number of bytes in all fragments is recorded. -** in the page header at offset 7. -** -** SIZE DESCRIPTION -** 2 Byte offset of the next freeblock -** 2 Bytes in this freeblock -** -** Cells are of variable length. Cells are stored in the cell content area at -** the end of the page. Pointers to the cells are in the cell pointer array -** that immediately follows the page header. Cells is not necessarily -** contiguous or in order, but cell pointers are contiguous and in order. -** -** Cell content makes use of variable length integers. A variable -** length integer is 1 to 9 bytes where the lower 7 bits of each -** byte are used. The integer consists of all bytes that have bit 8 set and -** the first byte with bit 8 clear. The most significant byte of the integer -** appears first. A variable-length integer may not be more than 9 bytes long. -** As a special case, all 8 bytes of the 9th byte are used as data. This -** allows a 64-bit integer to be encoded in 9 bytes. -** -** 0x00 becomes 0x00000000 -** 0x7f becomes 0x0000007f -** 0x81 0x00 becomes 0x00000080 -** 0x82 0x00 becomes 0x00000100 -** 0x80 0x7f becomes 0x0000007f -** 0x8a 0x91 0xd1 0xac 0x78 becomes 0x12345678 -** 0x81 0x81 0x81 0x81 0x01 becomes 0x10204081 -** -** Variable length integers are used for rowids and to hold the number of -** bytes of key and data in a btree cell. -** -** The content of a cell looks like this: -** -** SIZE DESCRIPTION -** 4 Page number of the left child. Omitted if leaf flag is set. -** var Number of bytes of data. Omitted if the zerodata flag is set. -** var Number of bytes of key. Or the key itself if intkey flag is set. -** * Payload -** 4 First page of the overflow chain. Omitted if no overflow -** -** Overflow pages form a linked list. Each page except the last is completely -** filled with data (pagesize - 4 bytes). The last page can have as little -** as 1 byte of data. -** -** SIZE DESCRIPTION -** 4 Page number of next overflow page -** * Data -** -** Freelist pages come in two subtypes: trunk pages and leaf pages. The -** file header points to first in a linked list of trunk page. Each trunk -** page points to multiple leaf pages. The content of a leaf page is -** unspecified. A trunk page looks like this: -** -** SIZE DESCRIPTION -** 4 Page number of next trunk page -** 4 Number of leaf pointers on this page -** * zero or more pages numbers of leaves -*/ -#include "sqliteInt.h" -#include "pager.h" -#include "btree.h" -#include "os.h" -#include <assert.h> - -/* Round up a number to the next larger multiple of 8. This is used -** to force 8-byte alignment on 64-bit architectures. +** See the header comment on "btreeInt.h" for additional information. +** Including a description of file format and an overview of operation. */ -#define ROUND8(x) ((x+7)&~7) - - -/* The following value is the maximum cell size assuming a maximum page -** size give above. -*/ -#define MX_CELL_SIZE(pBt) (pBt->pageSize-8) - -/* The maximum number of cells on a single page of the database. This -** assumes a minimum cell size of 3 bytes. Such small cells will be -** exceedingly rare, but they are possible. -*/ -#define MX_CELL(pBt) ((pBt->pageSize-8)/3) - -/* Forward declarations */ -typedef struct MemPage MemPage; -typedef struct BtLock BtLock; - -/* -** This is a magic string that appears at the beginning of every -** SQLite database in order to identify the file as a real database. -** -** You can change this value at compile-time by specifying a -** -DSQLITE_FILE_HEADER="..." on the compiler command-line. The -** header must be exactly 16 bytes including the zero-terminator so -** the string itself should be 15 characters long. If you change -** the header, then your custom library will not be able to read -** databases generated by the standard tools and the standard tools -** will not be able to read databases created by your custom library. -*/ -#ifndef SQLITE_FILE_HEADER /* 123456789 123456 */ -# define SQLITE_FILE_HEADER "SQLite format 3" -#endif -static const char zMagicHeader[] = SQLITE_FILE_HEADER; - -/* -** Page type flags. An ORed combination of these flags appear as the -** first byte of every BTree page. -*/ -#define PTF_INTKEY 0x01 -#define PTF_ZERODATA 0x02 -#define PTF_LEAFDATA 0x04 -#define PTF_LEAF 0x08 - -/* -** As each page of the file is loaded into memory, an instance of the following -** structure is appended and initialized to zero. This structure stores -** information about the page that is decoded from the raw file page. -** -** The pParent field points back to the parent page. This allows us to -** walk up the BTree from any leaf to the root. Care must be taken to -** unref() the parent page pointer when this page is no longer referenced. -** The pageDestructor() routine handles that chore. -*/ -struct MemPage { - u8 isInit; /* True if previously initialized. MUST BE FIRST! */ - u8 idxShift; /* True if Cell indices have changed */ - u8 nOverflow; /* Number of overflow cell bodies in aCell[] */ - u8 intKey; /* True if intkey flag is set */ - u8 leaf; /* True if leaf flag is set */ - u8 zeroData; /* True if table stores keys only */ - u8 leafData; /* True if tables stores data on leaves only */ - u8 hasData; /* True if this page stores data */ - u8 hdrOffset; /* 100 for page 1. 0 otherwise */ - u8 childPtrSize; /* 0 if leaf==1. 4 if leaf==0 */ - u16 maxLocal; /* Copy of Btree.maxLocal or Btree.maxLeaf */ - u16 minLocal; /* Copy of Btree.minLocal or Btree.minLeaf */ - u16 cellOffset; /* Index in aData of first cell pointer */ - u16 idxParent; /* Index in parent of this node */ - u16 nFree; /* Number of free bytes on the page */ - u16 nCell; /* Number of cells on this page, local and ovfl */ - struct _OvflCell { /* Cells that will not fit on aData[] */ - u8 *pCell; /* Pointers to the body of the overflow cell */ - u16 idx; /* Insert this cell before idx-th non-overflow cell */ - } aOvfl[5]; - BtShared *pBt; /* Pointer back to BTree structure */ - u8 *aData; /* Pointer back to the start of the page */ - Pgno pgno; /* Page number for this page */ - MemPage *pParent; /* The parent of this page. NULL for root */ -}; - -/* -** The in-memory image of a disk page has the auxiliary information appended -** to the end. EXTRA_SIZE is the number of bytes of space needed to hold -** that extra information. -*/ -#define EXTRA_SIZE sizeof(MemPage) - -/* Btree handle */ -struct Btree { - sqlite3 *pSqlite; - BtShared *pBt; - u8 inTrans; /* TRANS_NONE, TRANS_READ or TRANS_WRITE */ -}; - -/* -** Btree.inTrans may take one of the following values. -** -** If the shared-data extension is enabled, there may be multiple users -** of the Btree structure. At most one of these may open a write transaction, -** but any number may have active read transactions. Variable Btree.pDb -** points to the handle that owns any current write-transaction. -*/ -#define TRANS_NONE 0 -#define TRANS_READ 1 -#define TRANS_WRITE 2 - -/* -** Everything we need to know about an open database -*/ -struct BtShared { - Pager *pPager; /* The page cache */ - BtCursor *pCursor; /* A list of all open cursors */ - MemPage *pPage1; /* First page of the database */ - u8 inStmt; /* True if we are in a statement subtransaction */ - u8 readOnly; /* True if the underlying file is readonly */ - u8 maxEmbedFrac; /* Maximum payload as % of total page size */ - u8 minEmbedFrac; /* Minimum payload as % of total page size */ - u8 minLeafFrac; /* Minimum leaf payload as % of total page size */ - u8 pageSizeFixed; /* True if the page size can no longer be changed */ -#ifndef SQLITE_OMIT_AUTOVACUUM - u8 autoVacuum; /* True if database supports auto-vacuum */ -#endif - u16 pageSize; /* Total number of bytes on a page */ - u16 usableSize; /* Number of usable bytes on each page */ - int maxLocal; /* Maximum local payload in non-LEAFDATA tables */ - int minLocal; /* Minimum local payload in non-LEAFDATA tables */ - int maxLeaf; /* Maximum local payload in a LEAFDATA table */ - int minLeaf; /* Minimum local payload in a LEAFDATA table */ - BusyHandler *pBusyHandler; /* Callback for when there is lock contention */ - u8 inTransaction; /* Transaction state */ - int nRef; /* Number of references to this structure */ - int nTransaction; /* Number of open transactions (read + write) */ - void *pSchema; /* Pointer to space allocated by sqlite3BtreeSchema() */ - void (*xFreeSchema)(void*); /* Destructor for BtShared.pSchema */ -#ifndef SQLITE_OMIT_SHARED_CACHE - BtLock *pLock; /* List of locks held on this shared-btree struct */ - BtShared *pNext; /* Next in ThreadData.pBtree linked list */ -#endif -}; - -/* -** An instance of the following structure is used to hold information -** about a cell. The parseCellPtr() function fills in this structure -** based on information extract from the raw disk page. -*/ -typedef struct CellInfo CellInfo; -struct CellInfo { - u8 *pCell; /* Pointer to the start of cell content */ - i64 nKey; /* The key for INTKEY tables, or number of bytes in key */ - u32 nData; /* Number of bytes of data */ - u16 nHeader; /* Size of the cell content header in bytes */ - u16 nLocal; /* Amount of payload held locally */ - u16 iOverflow; /* Offset to overflow page number. Zero if no overflow */ - u16 nSize; /* Size of the cell content on the main b-tree page */ -}; - -/* -** A cursor is a pointer to a particular entry in the BTree. -** The entry is identified by its MemPage and the index in -** MemPage.aCell[] of the entry. -*/ -struct BtCursor { - Btree *pBtree; /* The Btree to which this cursor belongs */ - BtCursor *pNext, *pPrev; /* Forms a linked list of all cursors */ - int (*xCompare)(void*,int,const void*,int,const void*); /* Key comp func */ - void *pArg; /* First arg to xCompare() */ - Pgno pgnoRoot; /* The root page of this tree */ - MemPage *pPage; /* Page that contains the entry */ - int idx; /* Index of the entry in pPage->aCell[] */ - CellInfo info; /* A parse of the cell we are pointing at */ - u8 wrFlag; /* True if writable */ - u8 eState; /* One of the CURSOR_XXX constants (see below) */ -#ifndef SQLITE_OMIT_SHARED_CACHE - void *pKey; /* Saved key that was cursor's last known position */ - i64 nKey; /* Size of pKey, or last integer key */ - int skip; /* (skip<0) -> Prev() is a no-op. (skip>0) -> Next() is */ -#endif -}; +#include "btreeInt.h" /* -** Potential values for BtCursor.eState. The first two values (VALID and -** INVALID) may occur in any build. The third (REQUIRESEEK) may only occur -** if sqlite was compiled without the OMIT_SHARED_CACHE symbol defined. -** -** CURSOR_VALID: -** Cursor points to a valid entry. getPayload() etc. may be called. -** -** CURSOR_INVALID: -** Cursor does not point to a valid entry. This can happen (for example) -** because the table is empty or because BtreeCursorFirst() has not been -** called. -** -** CURSOR_REQUIRESEEK: -** The table that this cursor was opened on still exists, but has been -** modified since the cursor was last used. The cursor position is saved -** in variables BtCursor.pKey and BtCursor.nKey. When a cursor is in -** this state, restoreOrClearCursorPosition() can be called to attempt to -** seek the cursor to the saved position. +** The header string that appears at the beginning of every +** SQLite database. */ -#define CURSOR_INVALID 0 -#define CURSOR_VALID 1 -#define CURSOR_REQUIRESEEK 2 +static const char zMagicHeader[] = SQLITE_FILE_HEADER; /* -** The TRACE macro will print high-level status information about the -** btree operation when the global variable sqlite3_btree_trace is -** enabled. +** Set this global variable to 1 to enable tracing using the TRACE +** macro. */ #if SQLITE_TEST -# define TRACE(X) if( sqlite3_btree_trace )\ - { sqlite3DebugPrintf X; fflush(stdout); } int sqlite3_btree_trace=0; /* True to enable tracing */ -#else -# define TRACE(X) #endif -/* -** Forward declaration -*/ -static int checkReadLocks(BtShared*,Pgno,BtCursor*); -/* -** Read or write a two- and four-byte big-endian integer values. -*/ -static u32 get2byte(unsigned char *p){ - return (p[0]<<8) | p[1]; -} -static u32 get4byte(unsigned char *p){ - return (p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3]; -} -static void put2byte(unsigned char *p, u32 v){ - p[0] = v>>8; - p[1] = v; -} -static void put4byte(unsigned char *p, u32 v){ - p[0] = v>>24; - p[1] = v>>16; - p[2] = v>>8; - p[3] = v; -} +#ifndef SQLITE_OMIT_SHARED_CACHE /* -** Routines to read and write variable-length integers. These used to -** be defined locally, but now we use the varint routines in the util.c -** file. +** A flag to indicate whether or not shared cache is enabled. Also, +** a list of BtShared objects that are eligible for participation +** in shared cache. The variables have file scope during normal builds, +** but the test harness needs to access these variables so we make them +** global for test builds. */ -#define getVarint sqlite3GetVarint -/* #define getVarint32 sqlite3GetVarint32 */ -#define getVarint32(A,B) ((*B=*(A))<=0x7f?1:sqlite3GetVarint32(A,B)) -#define putVarint sqlite3PutVarint +#ifdef SQLITE_TEST +BtShared *sqlite3SharedCacheList = 0; +int sqlite3SharedCacheEnabled = 0; +#else +static BtShared *sqlite3SharedCacheList = 0; +static int sqlite3SharedCacheEnabled = 0; +#endif +#endif /* SQLITE_OMIT_SHARED_CACHE */ -/* The database page the PENDING_BYTE occupies. This page is never used. -** TODO: This macro is very similary to PAGER_MJ_PGNO() in pager.c. They -** should possibly be consolidated (presumably in pager.h). +#ifndef SQLITE_OMIT_SHARED_CACHE +/* +** Enable or disable the shared pager and schema features. ** -** If disk I/O is omitted (meaning that the database is stored purely -** in memory) then there is no pending byte. +** This routine has no effect on existing database connections. +** The shared cache setting effects only future calls to +** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2(). */ -#ifdef SQLITE_OMIT_DISKIO -# define PENDING_BYTE_PAGE(pBt) 0x7fffffff -#else -# define PENDING_BYTE_PAGE(pBt) ((PENDING_BYTE/(pBt)->pageSize)+1) +int sqlite3_enable_shared_cache(int enable){ + sqlite3SharedCacheEnabled = enable; + return SQLITE_OK; +} #endif + /* -** A linked list of the following structures is stored at BtShared.pLock. -** Locks are added (or upgraded from READ_LOCK to WRITE_LOCK) when a cursor -** is opened on the table with root page BtShared.iTable. Locks are removed -** from this list when a transaction is committed or rolled back, or when -** a btree handle is closed. +** Forward declaration */ -struct BtLock { - Btree *pBtree; /* Btree handle holding this lock */ - Pgno iTable; /* Root page of table */ - u8 eLock; /* READ_LOCK or WRITE_LOCK */ - BtLock *pNext; /* Next in BtShared.pLock list */ -}; +static int checkReadLocks(Btree*,Pgno,BtCursor*); -/* Candidate values for BtLock.eLock */ -#define READ_LOCK 1 -#define WRITE_LOCK 2 #ifdef SQLITE_OMIT_SHARED_CACHE /* @@ -509,106 +83,9 @@ struct BtLock { #define queryTableLock(a,b,c) SQLITE_OK #define lockTable(a,b,c) SQLITE_OK #define unlockAllTables(a) - #define restoreOrClearCursorPosition(a,b) SQLITE_OK - #define saveAllCursors(a,b,c) SQLITE_OK - -#else - -static void releasePage(MemPage *pPage); - -/* -** Save the current cursor position in the variables BtCursor.nKey -** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK. -*/ -static int saveCursorPosition(BtCursor *pCur){ - int rc; - - assert( CURSOR_VALID==pCur->eState ); - assert( 0==pCur->pKey ); - - rc = sqlite3BtreeKeySize(pCur, &pCur->nKey); - - /* If this is an intKey table, then the above call to BtreeKeySize() - ** stores the integer key in pCur->nKey. In this case this value is - ** all that is required. Otherwise, if pCur is not open on an intKey - ** table, then malloc space for and store the pCur->nKey bytes of key - ** data. - */ - if( rc==SQLITE_OK && 0==pCur->pPage->intKey){ - void *pKey = sqliteMalloc(pCur->nKey); - if( pKey ){ - rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey); - if( rc==SQLITE_OK ){ - pCur->pKey = pKey; - }else{ - sqliteFree(pKey); - } - }else{ - rc = SQLITE_NOMEM; - } - } - assert( !pCur->pPage->intKey || !pCur->pKey ); - - if( rc==SQLITE_OK ){ - releasePage(pCur->pPage); - pCur->pPage = 0; - pCur->eState = CURSOR_REQUIRESEEK; - } - - return rc; -} - -/* -** Save the positions of all cursors except pExcept open on the table -** with root-page iRoot. Usually, this is called just before cursor -** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()). -*/ -static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ - BtCursor *p; - if( sqlite3ThreadDataReadOnly()->useSharedData ){ - for(p=pBt->pCursor; p; p=p->pNext){ - if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) && - p->eState==CURSOR_VALID ){ - int rc = saveCursorPosition(p); - if( SQLITE_OK!=rc ){ - return rc; - } - } - } - } - return SQLITE_OK; -} - -/* -** Restore the cursor to the position it was in (or as close to as possible) -** when saveCursorPosition() was called. Note that this call deletes the -** saved position info stored by saveCursorPosition(), so there can be -** at most one effective restoreOrClearCursorPosition() call after each -** saveCursorPosition(). -** -** If the second argument argument - doSeek - is false, then instead of -** returning the cursor to it's saved position, any saved position is deleted -** and the cursor state set to CURSOR_INVALID. -*/ -static int restoreOrClearCursorPositionX(BtCursor *pCur, int doSeek){ - int rc = SQLITE_OK; - assert( sqlite3ThreadDataReadOnly()->useSharedData ); - assert( pCur->eState==CURSOR_REQUIRESEEK ); - pCur->eState = CURSOR_INVALID; - if( doSeek ){ - rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, &pCur->skip); - } - if( rc==SQLITE_OK ){ - sqliteFree(pCur->pKey); - pCur->pKey = 0; - assert( CURSOR_VALID==pCur->eState || CURSOR_INVALID==pCur->eState ); - } - return rc; -} - -#define restoreOrClearCursorPosition(p,x) \ - (p->eState==CURSOR_REQUIRESEEK?restoreOrClearCursorPositionX(p,x):SQLITE_OK) +#endif +#ifndef SQLITE_OMIT_SHARED_CACHE /* ** Query to see if btree handle p may obtain a lock of type eLock ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return @@ -619,11 +96,20 @@ static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){ BtShared *pBt = p->pBt; BtLock *pIter; + assert( sqlite3BtreeHoldsMutex(p) ); + /* This is a no-op if the shared-cache is not enabled */ - if( 0==sqlite3ThreadDataReadOnly()->useSharedData ){ + if( !p->sharable ){ return SQLITE_OK; } + /* If some other connection is holding an exclusive lock, the + ** requested lock may not be obtained. + */ + if( pBt->pExclusive && pBt->pExclusive!=p ){ + return SQLITE_LOCKED; + } + /* This (along with lockTable()) is where the ReadUncommitted flag is ** dealt with. If the caller is querying for a read-lock and the flag is ** set, it is unconditionally granted - even if there are write-locks @@ -639,8 +125,8 @@ static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){ ** write-cursor does not change. */ if( - !p->pSqlite || - 0==(p->pSqlite->flags&SQLITE_ReadUncommitted) || + !p->db || + 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK || iTab==MASTER_ROOT ){ @@ -653,7 +139,9 @@ static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){ } return SQLITE_OK; } +#endif /* !SQLITE_OMIT_SHARED_CACHE */ +#ifndef SQLITE_OMIT_SHARED_CACHE /* ** Add a lock on the table with root-page iTable to the shared-btree used ** by Btree handle p. Parameter eLock must be either READ_LOCK or @@ -667,8 +155,10 @@ static int lockTable(Btree *p, Pgno iTable, u8 eLock){ BtLock *pLock = 0; BtLock *pIter; + assert( sqlite3BtreeHoldsMutex(p) ); + /* This is a no-op if the shared-cache is not enabled */ - if( 0==sqlite3ThreadDataReadOnly()->useSharedData ){ + if( !p->sharable ){ return SQLITE_OK; } @@ -680,8 +170,8 @@ static int lockTable(Btree *p, Pgno iTable, u8 eLock){ ** the ReadUncommitted flag. */ if( - (p->pSqlite) && - (p->pSqlite->flags&SQLITE_ReadUncommitted) && + (p->db) && + (p->db->flags&SQLITE_ReadUncommitted) && (eLock==READ_LOCK) && iTable!=MASTER_ROOT ){ @@ -700,7 +190,7 @@ static int lockTable(Btree *p, Pgno iTable, u8 eLock){ ** with table iTable, allocate one and link it into the list. */ if( !pLock ){ - pLock = (BtLock *)sqliteMalloc(sizeof(BtLock)); + pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock)); if( !pLock ){ return SQLITE_NOMEM; } @@ -721,56 +211,200 @@ static int lockTable(Btree *p, Pgno iTable, u8 eLock){ return SQLITE_OK; } +#endif /* !SQLITE_OMIT_SHARED_CACHE */ +#ifndef SQLITE_OMIT_SHARED_CACHE /* ** Release all the table locks (locks obtained via calls to the lockTable() ** procedure) held by Btree handle p. */ static void unlockAllTables(Btree *p){ - BtLock **ppIter = &p->pBt->pLock; + BtShared *pBt = p->pBt; + BtLock **ppIter = &pBt->pLock; - /* If the shared-cache extension is not enabled, there should be no - ** locks in the BtShared.pLock list, making this procedure a no-op. Assert - ** that this is the case. - */ - assert( sqlite3ThreadDataReadOnly()->useSharedData || 0==*ppIter ); + assert( sqlite3BtreeHoldsMutex(p) ); + assert( p->sharable || 0==*ppIter ); while( *ppIter ){ BtLock *pLock = *ppIter; + assert( pBt->pExclusive==0 || pBt->pExclusive==pLock->pBtree ); if( pLock->pBtree==p ){ *ppIter = pLock->pNext; - sqliteFree(pLock); + sqlite3_free(pLock); }else{ ppIter = &pLock->pNext; } } + + if( pBt->pExclusive==p ){ + pBt->pExclusive = 0; + } } #endif /* SQLITE_OMIT_SHARED_CACHE */ -#ifndef SQLITE_OMIT_AUTOVACUUM +static void releasePage(MemPage *pPage); /* Forward reference */ + /* -** These macros define the location of the pointer-map entry for a -** database page. The first argument to each is the number of usable -** bytes on each page of the database (often 1024). The second is the -** page number to look up in the pointer map. -** -** PTRMAP_PAGENO returns the database page number of the pointer-map -** page that stores the required pointer. PTRMAP_PTROFFSET returns -** the offset of the requested map entry. +** Verify that the cursor holds a mutex on the BtShared +*/ +#ifndef NDEBUG +static int cursorHoldsMutex(BtCursor *p){ + return sqlite3_mutex_held(p->pBt->mutex); +} +#endif + + +#ifndef SQLITE_OMIT_INCRBLOB +/* +** Invalidate the overflow page-list cache for cursor pCur, if any. +*/ +static void invalidateOverflowCache(BtCursor *pCur){ + assert( cursorHoldsMutex(pCur) ); + sqlite3_free(pCur->aOverflow); + pCur->aOverflow = 0; +} + +/* +** Invalidate the overflow page-list cache for all cursors opened +** on the shared btree structure pBt. +*/ +static void invalidateAllOverflowCache(BtShared *pBt){ + BtCursor *p; + assert( sqlite3_mutex_held(pBt->mutex) ); + for(p=pBt->pCursor; p; p=p->pNext){ + invalidateOverflowCache(p); + } +} +#else + #define invalidateOverflowCache(x) + #define invalidateAllOverflowCache(x) +#endif + +/* +** Save the current cursor position in the variables BtCursor.nKey +** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK. +*/ +static int saveCursorPosition(BtCursor *pCur){ + int rc; + + assert( CURSOR_VALID==pCur->eState ); + assert( 0==pCur->pKey ); + assert( cursorHoldsMutex(pCur) ); + + rc = sqlite3BtreeKeySize(pCur, &pCur->nKey); + + /* If this is an intKey table, then the above call to BtreeKeySize() + ** stores the integer key in pCur->nKey. In this case this value is + ** all that is required. Otherwise, if pCur is not open on an intKey + ** table, then malloc space for and store the pCur->nKey bytes of key + ** data. + */ + if( rc==SQLITE_OK && 0==pCur->pPage->intKey){ + void *pKey = sqlite3_malloc(pCur->nKey); + if( pKey ){ + rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey); + if( rc==SQLITE_OK ){ + pCur->pKey = pKey; + }else{ + sqlite3_free(pKey); + } + }else{ + rc = SQLITE_NOMEM; + } + } + assert( !pCur->pPage->intKey || !pCur->pKey ); + + if( rc==SQLITE_OK ){ + releasePage(pCur->pPage); + pCur->pPage = 0; + pCur->eState = CURSOR_REQUIRESEEK; + } + + invalidateOverflowCache(pCur); + return rc; +} + +/* +** Save the positions of all cursors except pExcept open on the table +** with root-page iRoot. Usually, this is called just before cursor +** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()). +*/ +static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ + BtCursor *p; + assert( sqlite3_mutex_held(pBt->mutex) ); + assert( pExcept==0 || pExcept->pBt==pBt ); + for(p=pBt->pCursor; p; p=p->pNext){ + if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) && + p->eState==CURSOR_VALID ){ + int rc = saveCursorPosition(p); + if( SQLITE_OK!=rc ){ + return rc; + } + } + } + return SQLITE_OK; +} + +/* +** Clear the current cursor position. +*/ +static void clearCursorPosition(BtCursor *pCur){ + assert( cursorHoldsMutex(pCur) ); + sqlite3_free(pCur->pKey); + pCur->pKey = 0; + pCur->eState = CURSOR_INVALID; +} + +/* +** Restore the cursor to the position it was in (or as close to as possible) +** when saveCursorPosition() was called. Note that this call deletes the +** saved position info stored by saveCursorPosition(), so there can be +** at most one effective restoreOrClearCursorPosition() call after each +** saveCursorPosition(). ** -** If the pgno argument passed to PTRMAP_PAGENO is a pointer-map page, -** then pgno is returned. So (pgno==PTRMAP_PAGENO(pgsz, pgno)) can be -** used to test if pgno is a pointer-map page. PTRMAP_ISPAGE implements -** this test. +** If the second argument argument - doSeek - is false, then instead of +** returning the cursor to its saved position, any saved position is deleted +** and the cursor state set to CURSOR_INVALID. */ -#define PTRMAP_PAGENO(pBt, pgno) ptrmapPageno(pBt, pgno) -#define PTRMAP_PTROFFSET(pBt, pgno) (5*(pgno-ptrmapPageno(pBt, pgno)-1)) -#define PTRMAP_ISPAGE(pBt, pgno) (PTRMAP_PAGENO((pBt),(pgno))==(pgno)) +int sqlite3BtreeRestoreOrClearCursorPosition(BtCursor *pCur){ + int rc; + assert( cursorHoldsMutex(pCur) ); + assert( pCur->eState>=CURSOR_REQUIRESEEK ); + if( pCur->eState==CURSOR_FAULT ){ + return pCur->skip; + } +#ifndef SQLITE_OMIT_INCRBLOB + if( pCur->isIncrblobHandle ){ + return SQLITE_ABORT; + } +#endif + pCur->eState = CURSOR_INVALID; + rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip); + if( rc==SQLITE_OK ){ + sqlite3_free(pCur->pKey); + pCur->pKey = 0; + assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID ); + } + return rc; +} +#define restoreOrClearCursorPosition(p) \ + (p->eState>=CURSOR_REQUIRESEEK ? \ + sqlite3BtreeRestoreOrClearCursorPosition(p) : \ + SQLITE_OK) + +#ifndef SQLITE_OMIT_AUTOVACUUM +/* +** Given a page number of a regular database page, return the page +** number for the pointer-map page that contains the entry for the +** input page number. +*/ static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ - int nPagesPerMapPage = (pBt->usableSize/5)+1; - int iPtrMap = (pgno-2)/nPagesPerMapPage; - int ret = (iPtrMap*nPagesPerMapPage) + 2; + int nPagesPerMapPage, iPtrMap, ret; + assert( sqlite3_mutex_held(pBt->mutex) ); + nPagesPerMapPage = (pBt->usableSize/5)+1; + iPtrMap = (pgno-2)/nPagesPerMapPage; + ret = (iPtrMap*nPagesPerMapPage) + 2; if( ret==PENDING_BYTE_PAGE(pBt) ){ ret++; } @@ -778,43 +412,6 @@ static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ } /* -** The pointer map is a lookup table that identifies the parent page for -** each child page in the database file. The parent page is the page that -** contains a pointer to the child. Every page in the database contains -** 0 or 1 parent pages. (In this context 'database page' refers -** to any page that is not part of the pointer map itself.) Each pointer map -** entry consists of a single byte 'type' and a 4 byte parent page number. -** The PTRMAP_XXX identifiers below are the valid types. -** -** The purpose of the pointer map is to facility moving pages from one -** position in the file to another as part of autovacuum. When a page -** is moved, the pointer in its parent must be updated to point to the -** new location. The pointer map is used to locate the parent page quickly. -** -** PTRMAP_ROOTPAGE: The database page is a root-page. The page-number is not -** used in this case. -** -** PTRMAP_FREEPAGE: The database page is an unused (free) page. The page-number -** is not used in this case. -** -** PTRMAP_OVERFLOW1: The database page is the first page in a list of -** overflow pages. The page number identifies the page that -** contains the cell with a pointer to this overflow page. -** -** PTRMAP_OVERFLOW2: The database page is the second or later page in a list of -** overflow pages. The page-number identifies the previous -** page in the overflow page list. -** -** PTRMAP_BTREE: The database page is a non-root btree page. The page number -** identifies the parent page in the btree. -*/ -#define PTRMAP_ROOTPAGE 1 -#define PTRMAP_FREEPAGE 2 -#define PTRMAP_OVERFLOW1 3 -#define PTRMAP_OVERFLOW2 4 -#define PTRMAP_BTREE 5 - -/* ** Write an entry into the pointer map. ** ** This routine updates the pointer map entry for page number 'key' @@ -822,11 +419,13 @@ static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ ** An error code is returned if something goes wrong, otherwise SQLITE_OK. */ static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){ - u8 *pPtrmap; /* The pointer map page */ - Pgno iPtrmap; /* The pointer map page number */ - int offset; /* Offset in pointer map page */ + DbPage *pDbPage; /* The pointer map page */ + u8 *pPtrmap; /* The pointer map data */ + Pgno iPtrmap; /* The pointer map page number */ + int offset; /* Offset in pointer map page */ int rc; + assert( sqlite3_mutex_held(pBt->mutex) ); /* The master-journal page number must never be used as a pointer map page */ assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) ); @@ -835,22 +434,23 @@ static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){ return SQLITE_CORRUPT_BKPT; } iPtrmap = PTRMAP_PAGENO(pBt, key); - rc = sqlite3pager_get(pBt->pPager, iPtrmap, (void **)&pPtrmap); + rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); if( rc!=SQLITE_OK ){ return rc; } offset = PTRMAP_PTROFFSET(pBt, key); + pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent)); - rc = sqlite3pager_write(pPtrmap); + rc = sqlite3PagerWrite(pDbPage); if( rc==SQLITE_OK ){ pPtrmap[offset] = eType; put4byte(&pPtrmap[offset+1], parent); } } - sqlite3pager_unref(pPtrmap); + sqlite3PagerUnref(pDbPage); return rc; } @@ -862,23 +462,27 @@ static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){ ** An error code is returned if something goes wrong, otherwise SQLITE_OK. */ static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ + DbPage *pDbPage; /* The pointer map page */ int iPtrmap; /* Pointer map page index */ u8 *pPtrmap; /* Pointer map page data */ int offset; /* Offset of entry in pointer map */ int rc; + assert( sqlite3_mutex_held(pBt->mutex) ); + iPtrmap = PTRMAP_PAGENO(pBt, key); - rc = sqlite3pager_get(pBt->pPager, iPtrmap, (void **)&pPtrmap); + rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); if( rc!=0 ){ return rc; } + pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); offset = PTRMAP_PTROFFSET(pBt, key); assert( pEType!=0 ); *pEType = pPtrmap[offset]; if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]); - sqlite3pager_unref(pPtrmap); + sqlite3PagerUnref(pDbPage); if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT; return SQLITE_OK; } @@ -892,19 +496,23 @@ static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ ** ** This routine works only for pages that do not contain overflow cells. */ -static u8 *findCell(MemPage *pPage, int iCell){ - u8 *data = pPage->aData; +#define findCell(pPage, iCell) \ + ((pPage)->aData + get2byte(&(pPage)->aData[(pPage)->cellOffset+2*(iCell)])) +#ifdef SQLITE_TEST +u8 *sqlite3BtreeFindCell(MemPage *pPage, int iCell){ assert( iCell>=0 ); - assert( iCell<get2byte(&data[pPage->hdrOffset+3]) ); - return data + get2byte(&data[pPage->cellOffset+2*iCell]); + assert( iCell<get2byte(&pPage->aData[pPage->hdrOffset+3]) ); + return findCell(pPage, iCell); } +#endif /* -** This a more complex version of findCell() that works for +** This a more complex version of sqlite3BtreeFindCell() that works for ** pages that do contain overflow cells. See insert */ static u8 *findOverflowCell(MemPage *pPage, int iCell){ int i; + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); for(i=pPage->nOverflow-1; i>=0; i--){ int k; struct _OvflCell *pOvfl; @@ -922,11 +530,14 @@ static u8 *findOverflowCell(MemPage *pPage, int iCell){ /* ** Parse a cell content block and fill in the CellInfo structure. There -** are two versions of this function. parseCell() takes a cell index -** as the second argument and parseCellPtr() takes a pointer to the -** body of the cell as its second argument. +** are two versions of this function. sqlite3BtreeParseCell() takes a +** cell index as the second argument and sqlite3BtreeParseCellPtr() +** takes a pointer to the body of the cell as its second argument. +** +** Within this file, the parseCell() macro can be called instead of +** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster. */ -static void parseCellPtr( +void sqlite3BtreeParseCellPtr( MemPage *pPage, /* Page containing the cell */ u8 *pCell, /* Pointer to the cell text. */ CellInfo *pInfo /* Fill in this structure */ @@ -934,6 +545,8 @@ static void parseCellPtr( int n; /* Number bytes in cell content header */ u32 nPayload; /* Number of bytes of cell payload */ + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); + pInfo->pCell = pCell; assert( pPage->leaf==0 || pPage->leaf==1 ); n = pPage->childPtrSize; @@ -952,6 +565,7 @@ static void parseCellPtr( pInfo->nKey = x; nPayload += x; } + pInfo->nPayload = nPayload; pInfo->nHeader = n; if( nPayload<=pPage->maxLocal ){ /* This is the (easy) common case where the entire payload fits @@ -991,12 +605,14 @@ static void parseCellPtr( pInfo->nSize = pInfo->iOverflow + 4; } } -static void parseCell( +#define parseCell(pPage, iCell, pInfo) \ + sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo)) +void sqlite3BtreeParseCell( MemPage *pPage, /* Page containing the cell */ int iCell, /* The cell index. First cell is 0 */ CellInfo *pInfo /* Fill in this structure */ ){ - parseCellPtr(pPage, findCell(pPage, iCell), pInfo); + parseCell(pPage, iCell, pInfo); } /* @@ -1008,13 +624,13 @@ static void parseCell( #ifndef NDEBUG static int cellSize(MemPage *pPage, int iCell){ CellInfo info; - parseCell(pPage, iCell, &info); + sqlite3BtreeParseCell(pPage, iCell, &info); return info.nSize; } #endif static int cellSizePtr(MemPage *pPage, u8 *pCell){ CellInfo info; - parseCellPtr(pPage, pCell, &info); + sqlite3BtreeParseCellPtr(pPage, pCell, &info); return info.nSize; } @@ -1027,7 +643,8 @@ static int cellSizePtr(MemPage *pPage, u8 *pCell){ static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){ if( pCell ){ CellInfo info; - parseCellPtr(pPage, pCell, &info); + sqlite3BtreeParseCellPtr(pPage, pCell, &info); + assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){ Pgno ovfl = get4byte(&pCell[info.iOverflow]); return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno); @@ -1042,6 +659,7 @@ static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){ */ static int ptrmapPutOvfl(MemPage *pPage, int iCell){ u8 *pCell; + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); pCell = findOverflowCell(pPage, iCell); return ptrmapPutOvflPtr(pPage, pCell); } @@ -1049,100 +667,6 @@ static int ptrmapPutOvfl(MemPage *pPage, int iCell){ /* -** Do sanity checking on a page. Throw an exception if anything is -** not right. -** -** This routine is used for internal error checking only. It is omitted -** from most builds. -*/ -#if defined(BTREE_DEBUG) && !defined(NDEBUG) && 0 -static void _pageIntegrity(MemPage *pPage){ - int usableSize; - u8 *data; - int i, j, idx, c, pc, hdr, nFree; - int cellOffset; - int nCell, cellLimit; - u8 *used; - - used = sqliteMallocRaw( pPage->pBt->pageSize ); - if( used==0 ) return; - usableSize = pPage->pBt->usableSize; - assert( pPage->aData==&((unsigned char*)pPage)[-pPage->pBt->pageSize] ); - hdr = pPage->hdrOffset; - assert( hdr==(pPage->pgno==1 ? 100 : 0) ); - assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) ); - c = pPage->aData[hdr]; - if( pPage->isInit ){ - assert( pPage->leaf == ((c & PTF_LEAF)!=0) ); - assert( pPage->zeroData == ((c & PTF_ZERODATA)!=0) ); - assert( pPage->leafData == ((c & PTF_LEAFDATA)!=0) ); - assert( pPage->intKey == ((c & (PTF_INTKEY|PTF_LEAFDATA))!=0) ); - assert( pPage->hasData == - !(pPage->zeroData || (!pPage->leaf && pPage->leafData)) ); - assert( pPage->cellOffset==pPage->hdrOffset+12-4*pPage->leaf ); - assert( pPage->nCell = get2byte(&pPage->aData[hdr+3]) ); - } - data = pPage->aData; - memset(used, 0, usableSize); - for(i=0; i<hdr+10-pPage->leaf*4; i++) used[i] = 1; - nFree = 0; - pc = get2byte(&data[hdr+1]); - while( pc ){ - int size; - assert( pc>0 && pc<usableSize-4 ); - size = get2byte(&data[pc+2]); - assert( pc+size<=usableSize ); - nFree += size; - for(i=pc; i<pc+size; i++){ - assert( used[i]==0 ); - used[i] = 1; - } - pc = get2byte(&data[pc]); - } - idx = 0; - nCell = get2byte(&data[hdr+3]); - cellLimit = get2byte(&data[hdr+5]); - assert( pPage->isInit==0 - || pPage->nFree==nFree+data[hdr+7]+cellLimit-(cellOffset+2*nCell) ); - cellOffset = pPage->cellOffset; - for(i=0; i<nCell; i++){ - int size; - pc = get2byte(&data[cellOffset+2*i]); - assert( pc>0 && pc<usableSize-4 ); - size = cellSize(pPage, &data[pc]); - assert( pc+size<=usableSize ); - for(j=pc; j<pc+size; j++){ - assert( used[j]==0 ); - used[j] = 1; - } - } - for(i=cellOffset+2*nCell; i<cellimit; i++){ - assert( used[i]==0 ); - used[i] = 1; - } - nFree = 0; - for(i=0; i<usableSize; i++){ - assert( used[i]<=1 ); - if( used[i]==0 ) nFree++; - } - assert( nFree==data[hdr+7] ); - sqliteFree(used); -} -#define pageIntegrity(X) _pageIntegrity(X) -#else -# define pageIntegrity(X) -#endif - -/* A bunch of assert() statements to check the transaction state variables -** of handle p (type Btree*) are internally consistent. -*/ -#define btreeIntegrity(p) \ - assert( p->inTrans!=TRANS_NONE || p->pBt->nTransaction<p->pBt->nRef ); \ - assert( p->pBt->nTransaction<=p->pBt->nRef ); \ - assert( p->pBt->inTransaction!=TRANS_NONE || p->pBt->nTransaction==0 ); \ - assert( p->pBt->inTransaction>=p->inTrans ); - -/* ** Defragment the page given. All Cells are moved to the ** end of the page and all free space is collected into one ** big FreeBlk that occurs in between the header and cell @@ -1161,12 +685,12 @@ static int defragmentPage(MemPage *pPage){ unsigned char *data; /* The page data */ unsigned char *temp; /* Temp area for cell content */ - assert( sqlite3pager_iswriteable(pPage->aData) ); + assert( sqlite3PagerIswriteable(pPage->pDbPage) ); assert( pPage->pBt!=0 ); assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); assert( pPage->nOverflow==0 ); - temp = sqliteMalloc( pPage->pBt->pageSize ); - if( temp==0 ) return SQLITE_NOMEM; + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); + temp = sqlite3PagerTempSpace(pPage->pBt->pPager); data = pPage->aData; hdr = pPage->hdrOffset; cellOffset = pPage->cellOffset; @@ -1193,7 +717,6 @@ static int defragmentPage(MemPage *pPage){ data[hdr+7] = 0; addr = cellOffset+2*nCell; memset(&data[addr], 0, brk-addr); - sqliteFree(temp); return SQLITE_OK; } @@ -1219,8 +742,9 @@ static int allocateSpace(MemPage *pPage, int nByte){ unsigned char *data; data = pPage->aData; - assert( sqlite3pager_iswriteable(data) ); + assert( sqlite3PagerIswriteable(pPage->pDbPage) ); assert( pPage->pBt ); + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); if( nByte<4 ) nByte = 4; if( pPage->nFree<nByte || pPage->nOverflow>0 ) return 0; pPage->nFree -= nByte; @@ -1276,9 +800,10 @@ static void freeSpace(MemPage *pPage, int start, int size){ unsigned char *data = pPage->aData; assert( pPage->pBt!=0 ); - assert( sqlite3pager_iswriteable(data) ); + assert( sqlite3PagerIswriteable(pPage->pDbPage) ); assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) ); assert( (start + size)<=pPage->pBt->usableSize ); + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); if( size<4 ) size = 4; #ifdef SQLITE_SECURE_DELETE @@ -1339,6 +864,7 @@ static void decodeFlags(MemPage *pPage, int flagByte){ BtShared *pBt; /* A copy of pPage->pBt */ assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) ); + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); pPage->intKey = (flagByte & (PTF_INTKEY|PTF_LEAFDATA))!=0; pPage->zeroData = (flagByte & PTF_ZERODATA)!=0; pPage->leaf = (flagByte & PTF_LEAF)!=0; @@ -1369,7 +895,7 @@ static void decodeFlags(MemPage *pPage, int flagByte){ ** guarantee that the page is well-formed. It only shows that ** we failed to detect any corruption. */ -static int initPage( +int sqlite3BtreeInitPage( MemPage *pPage, /* The page to be initialized */ MemPage *pParent /* The parent. Might be NULL */ ){ @@ -1385,8 +911,10 @@ static int initPage( pBt = pPage->pBt; assert( pBt!=0 ); assert( pParent==0 || pParent->pBt==pBt ); - assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) ); - assert( pPage->aData == &((unsigned char*)pPage)[-pBt->pageSize] ); + assert( sqlite3_mutex_held(pBt->mutex) ); + assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); + assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); + assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); if( pPage->pParent!=pParent && (pPage->pParent!=0 || pPage->isInit) ){ /* The parent page should never change unless the file is corrupt */ return SQLITE_CORRUPT_BKPT; @@ -1394,7 +922,7 @@ static int initPage( if( pPage->isInit ) return SQLITE_OK; if( pPage->pParent==0 && pParent!=0 ){ pPage->pParent = pParent; - sqlite3pager_ref(pParent->aData); + sqlite3PagerRef(pParent->pDbPage); } hdr = pPage->hdrOffset; data = pPage->aData; @@ -1439,7 +967,6 @@ static int initPage( } pPage->isInit = 1; - pageIntegrity(pPage); return SQLITE_OK; } @@ -1453,9 +980,11 @@ static void zeroPage(MemPage *pPage, int flags){ int hdr = pPage->hdrOffset; int first; - assert( sqlite3pager_pagenumber(data)==pPage->pgno ); - assert( &data[pBt->pageSize] == (unsigned char*)pPage ); - assert( sqlite3pager_iswriteable(data) ); + assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno ); + assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); + assert( sqlite3PagerGetData(pPage->pDbPage) == data ); + assert( sqlite3PagerIswriteable(pPage->pDbPage) ); + assert( sqlite3_mutex_held(pBt->mutex) ); memset(&data[hdr], 0, pBt->usableSize - hdr); data[hdr] = flags; first = hdr + 8 + 4*((flags&PTF_LEAF)==0); @@ -1470,21 +999,35 @@ static void zeroPage(MemPage *pPage, int flags){ pPage->idxShift = 0; pPage->nCell = 0; pPage->isInit = 1; - pageIntegrity(pPage); } /* ** Get a page from the pager. Initialize the MemPage.pBt and ** MemPage.aData elements if needed. -*/ -static int getPage(BtShared *pBt, Pgno pgno, MemPage **ppPage){ +** +** If the noContent flag is set, it means that we do not care about +** the content of the page at this time. So do not go to the disk +** to fetch the content. Just fill in the content with zeros for now. +** If in the future we call sqlite3PagerWrite() on this page, that +** means we have started to be concerned about content and the disk +** read should occur at that point. +*/ +int sqlite3BtreeGetPage( + BtShared *pBt, /* The btree */ + Pgno pgno, /* Number of the page to fetch */ + MemPage **ppPage, /* Return the page in this parameter */ + int noContent /* Do not load page content if true */ +){ int rc; - unsigned char *aData; MemPage *pPage; - rc = sqlite3pager_get(pBt->pPager, pgno, (void**)&aData); + DbPage *pDbPage; + + assert( sqlite3_mutex_held(pBt->mutex) ); + rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent); if( rc ) return rc; - pPage = (MemPage*)&aData[pBt->pageSize]; - pPage->aData = aData; + pPage = (MemPage *)sqlite3PagerGetExtra(pDbPage); + pPage->aData = sqlite3PagerGetData(pDbPage); + pPage->pDbPage = pDbPage; pPage->pBt = pBt; pPage->pgno = pgno; pPage->hdrOffset = pPage->pgno==1 ? 100 : 0; @@ -1495,7 +1038,7 @@ static int getPage(BtShared *pBt, Pgno pgno, MemPage **ppPage){ /* ** Get a page from the pager and initialize it. This routine ** is just a convenience wrapper around separate calls to -** getPage() and initPage(). +** sqlite3BtreeGetPage() and sqlite3BtreeInitPage(). */ static int getAndInitPage( BtShared *pBt, /* The database file */ @@ -1504,26 +1047,29 @@ static int getAndInitPage( MemPage *pParent /* Parent of the page */ ){ int rc; + assert( sqlite3_mutex_held(pBt->mutex) ); if( pgno==0 ){ return SQLITE_CORRUPT_BKPT; } - rc = getPage(pBt, pgno, ppPage); + rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0); if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){ - rc = initPage(*ppPage, pParent); + rc = sqlite3BtreeInitPage(*ppPage, pParent); } return rc; } /* ** Release a MemPage. This should be called once for each prior -** call to getPage. +** call to sqlite3BtreeGetPage. */ static void releasePage(MemPage *pPage){ if( pPage ){ assert( pPage->aData ); assert( pPage->pBt ); - assert( &pPage->aData[pPage->pBt->pageSize]==(unsigned char*)pPage ); - sqlite3pager_unref(pPage->aData); + assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); + assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); + sqlite3PagerUnref(pPage->pDbPage); } } @@ -1532,12 +1078,14 @@ static void releasePage(MemPage *pPage){ ** reaches zero. We need to unref the pParent pointer when that ** happens. */ -static void pageDestructor(void *pData, int pageSize){ +static void pageDestructor(DbPage *pData, int pageSize){ MemPage *pPage; assert( (pageSize & 7)==0 ); - pPage = (MemPage*)&((char*)pData)[pageSize]; + pPage = (MemPage *)sqlite3PagerGetExtra(pData); + assert( pPage->isInit==0 || sqlite3_mutex_held(pPage->pBt->mutex) ); if( pPage->pParent ){ MemPage *pParent = pPage->pParent; + assert( pParent->pBt==pPage->pBt ); pPage->pParent = 0; releasePage(pParent); } @@ -1552,37 +1100,49 @@ static void pageDestructor(void *pData, int pageSize){ ** This routine needs to reset the extra data section at the end of the ** page to agree with the restored data. */ -static void pageReinit(void *pData, int pageSize){ +static void pageReinit(DbPage *pData, int pageSize){ MemPage *pPage; assert( (pageSize & 7)==0 ); - pPage = (MemPage*)&((char*)pData)[pageSize]; + pPage = (MemPage *)sqlite3PagerGetExtra(pData); if( pPage->isInit ){ + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); pPage->isInit = 0; - initPage(pPage, pPage->pParent); + sqlite3BtreeInitPage(pPage, pPage->pParent); } } /* +** Invoke the busy handler for a btree. +*/ +static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){ + BtShared *pBt = (BtShared*)pArg; + assert( pBt->db ); + assert( sqlite3_mutex_held(pBt->db->mutex) ); + return sqlite3InvokeBusyHandler(&pBt->db->busyHandler); +} + +/* ** Open a database file. ** ** zFilename is the name of the database file. If zFilename is NULL ** a new database with a random name is created. This randomly named ** database file will be deleted when sqlite3BtreeClose() is called. +** If zFilename is ":memory:" then an in-memory database is created +** that is automatically destroyed when it is closed. */ int sqlite3BtreeOpen( const char *zFilename, /* Name of the file containing the BTree database */ - sqlite3 *pSqlite, /* Associated database handle */ + sqlite3 *db, /* Associated database handle */ Btree **ppBtree, /* Pointer to new Btree object written here */ - int flags /* Options */ + int flags, /* Options */ + int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */ ){ - BtShared *pBt; /* Shared part of btree structure */ + sqlite3_vfs *pVfs; /* The VFS to use for this btree */ + BtShared *pBt = 0; /* Shared part of btree structure */ Btree *p; /* Handle to return */ - int rc; + int rc = SQLITE_OK; int nReserve; unsigned char zDbHeader[100]; -#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) - const ThreadData *pTsdro; -#endif /* Set the variable isMemdb to true for an in-memory database, or ** false for a file-based database. This symbol is only required if @@ -1591,122 +1151,249 @@ int sqlite3BtreeOpen( */ #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM) #ifdef SQLITE_OMIT_MEMORYDB - const int isMemdb = !zFilename; + const int isMemdb = 0; #else - const int isMemdb = !zFilename || (strcmp(zFilename, ":memory:")?0:1); + const int isMemdb = zFilename && !strcmp(zFilename, ":memory:"); #endif #endif - p = sqliteMalloc(sizeof(Btree)); + assert( db!=0 ); + assert( sqlite3_mutex_held(db->mutex) ); + + pVfs = db->pVfs; + p = sqlite3MallocZero(sizeof(Btree)); if( !p ){ return SQLITE_NOMEM; } p->inTrans = TRANS_NONE; - p->pSqlite = pSqlite; + p->db = db; - /* Try to find an existing Btree structure opened on zFilename. */ #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) - pTsdro = sqlite3ThreadDataReadOnly(); - if( pTsdro->useSharedData && zFilename && !isMemdb ){ - char *zFullPathname = sqlite3OsFullPathname(zFilename); - if( !zFullPathname ){ - sqliteFree(p); - return SQLITE_NOMEM; - } - for(pBt=pTsdro->pBtree; pBt; pBt=pBt->pNext){ - assert( pBt->nRef>0 ); - if( 0==strcmp(zFullPathname, sqlite3pager_filename(pBt->pPager)) ){ - p->pBt = pBt; - *ppBtree = p; - pBt->nRef++; - sqliteFree(zFullPathname); - return SQLITE_OK; + /* + ** If this Btree is a candidate for shared cache, try to find an + ** existing BtShared object that we can share with + */ + if( (flags & BTREE_PRIVATE)==0 + && isMemdb==0 + && (db->flags & SQLITE_Vtab)==0 + && zFilename && zFilename[0] + ){ + if( sqlite3SharedCacheEnabled ){ + int nFullPathname = pVfs->mxPathname+1; + char *zFullPathname = (char *)sqlite3_malloc(nFullPathname); + sqlite3_mutex *mutexShared; + p->sharable = 1; + if( db ){ + db->flags |= SQLITE_SharedCache; + } + if( !zFullPathname ){ + sqlite3_free(p); + return SQLITE_NOMEM; } + sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname); + mutexShared = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER); + sqlite3_mutex_enter(mutexShared); + for(pBt=sqlite3SharedCacheList; pBt; pBt=pBt->pNext){ + assert( pBt->nRef>0 ); + if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager)) + && sqlite3PagerVfs(pBt->pPager)==pVfs ){ + p->pBt = pBt; + pBt->nRef++; + break; + } + } + sqlite3_mutex_leave(mutexShared); + sqlite3_free(zFullPathname); + } +#ifdef SQLITE_DEBUG + else{ + /* In debug mode, we mark all persistent databases as sharable + ** even when they are not. This exercises the locking code and + ** gives more opportunity for asserts(sqlite3_mutex_held()) + ** statements to find locking problems. + */ + p->sharable = 1; } - sqliteFree(zFullPathname); +#endif } #endif - - /* - ** The following asserts make sure that structures used by the btree are - ** the right size. This is to guard against size changes that result - ** when compiling on a different architecture. - */ - assert( sizeof(i64)==8 || sizeof(i64)==4 ); - assert( sizeof(u64)==8 || sizeof(u64)==4 ); - assert( sizeof(u32)==4 ); - assert( sizeof(u16)==2 ); - assert( sizeof(Pgno)==4 ); - - pBt = sqliteMalloc( sizeof(*pBt) ); if( pBt==0 ){ - *ppBtree = 0; - sqliteFree(p); - return SQLITE_NOMEM; - } - rc = sqlite3pager_open(&pBt->pPager, zFilename, EXTRA_SIZE, flags); - if( rc!=SQLITE_OK ){ - if( pBt->pPager ) sqlite3pager_close(pBt->pPager); - sqliteFree(pBt); - sqliteFree(p); - *ppBtree = 0; - return rc; - } - p->pBt = pBt; - - sqlite3pager_set_destructor(pBt->pPager, pageDestructor); - sqlite3pager_set_reiniter(pBt->pPager, pageReinit); - pBt->pCursor = 0; - pBt->pPage1 = 0; - pBt->readOnly = sqlite3pager_isreadonly(pBt->pPager); - sqlite3pager_read_fileheader(pBt->pPager, sizeof(zDbHeader), zDbHeader); - pBt->pageSize = get2byte(&zDbHeader[16]); - if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE - || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){ - pBt->pageSize = SQLITE_DEFAULT_PAGE_SIZE; - pBt->maxEmbedFrac = 64; /* 25% */ - pBt->minEmbedFrac = 32; /* 12.5% */ - pBt->minLeafFrac = 32; /* 12.5% */ -#ifndef SQLITE_OMIT_AUTOVACUUM - /* If the magic name ":memory:" will create an in-memory database, then - ** do not set the auto-vacuum flag, even if SQLITE_DEFAULT_AUTOVACUUM - ** is true. On the other hand, if SQLITE_OMIT_MEMORYDB has been defined, - ** then ":memory:" is just a regular file-name. Respect the auto-vacuum - ** default in this case. + /* + ** The following asserts make sure that structures used by the btree are + ** the right size. This is to guard against size changes that result + ** when compiling on a different architecture. */ - if( zFilename && !isMemdb ){ - pBt->autoVacuum = SQLITE_DEFAULT_AUTOVACUUM; + assert( sizeof(i64)==8 || sizeof(i64)==4 ); + assert( sizeof(u64)==8 || sizeof(u64)==4 ); + assert( sizeof(u32)==4 ); + assert( sizeof(u16)==2 ); + assert( sizeof(Pgno)==4 ); + + pBt = sqlite3MallocZero( sizeof(*pBt) ); + if( pBt==0 ){ + rc = SQLITE_NOMEM; + goto btree_open_out; } + pBt->busyHdr.xFunc = sqlite3BtreeInvokeBusyHandler; + pBt->busyHdr.pArg = pBt; + rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename, + EXTRA_SIZE, flags, vfsFlags); + if( rc==SQLITE_OK ){ + rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader); + } + if( rc!=SQLITE_OK ){ + goto btree_open_out; + } + sqlite3PagerSetBusyhandler(pBt->pPager, &pBt->busyHdr); + p->pBt = pBt; + + sqlite3PagerSetDestructor(pBt->pPager, pageDestructor); + sqlite3PagerSetReiniter(pBt->pPager, pageReinit); + pBt->pCursor = 0; + pBt->pPage1 = 0; + pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager); + pBt->pageSize = get2byte(&zDbHeader[16]); + if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE + || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){ + pBt->pageSize = 0; + sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); + pBt->maxEmbedFrac = 64; /* 25% */ + pBt->minEmbedFrac = 32; /* 12.5% */ + pBt->minLeafFrac = 32; /* 12.5% */ +#ifndef SQLITE_OMIT_AUTOVACUUM + /* If the magic name ":memory:" will create an in-memory database, then + ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if + ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if + ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a + ** regular file-name. In this case the auto-vacuum applies as per normal. + */ + if( zFilename && !isMemdb ){ + pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0); + pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0); + } #endif - nReserve = 0; - }else{ - nReserve = zDbHeader[20]; - pBt->maxEmbedFrac = zDbHeader[21]; - pBt->minEmbedFrac = zDbHeader[22]; - pBt->minLeafFrac = zDbHeader[23]; - pBt->pageSizeFixed = 1; + nReserve = 0; + }else{ + nReserve = zDbHeader[20]; + pBt->maxEmbedFrac = zDbHeader[21]; + pBt->minEmbedFrac = zDbHeader[22]; + pBt->minLeafFrac = zDbHeader[23]; + pBt->pageSizeFixed = 1; #ifndef SQLITE_OMIT_AUTOVACUUM - pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0); + pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0); + pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0); +#endif + } + pBt->usableSize = pBt->pageSize - nReserve; + assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */ + sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); + +#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) + /* Add the new BtShared object to the linked list sharable BtShareds. + */ + if( p->sharable ){ + sqlite3_mutex *mutexShared; + pBt->nRef = 1; + mutexShared = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER); + if( SQLITE_THREADSAFE ){ + pBt->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_FAST); + if( pBt->mutex==0 ){ + rc = SQLITE_NOMEM; + db->mallocFailed = 0; + goto btree_open_out; + } + } + sqlite3_mutex_enter(mutexShared); + pBt->pNext = sqlite3SharedCacheList; + sqlite3SharedCacheList = pBt; + sqlite3_mutex_leave(mutexShared); + } #endif } - pBt->usableSize = pBt->pageSize - nReserve; - assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */ - sqlite3pager_set_pagesize(pBt->pPager, pBt->pageSize); #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) - /* Add the new btree to the linked list starting at ThreadData.pBtree. - ** There is no chance that a malloc() may fail inside of the - ** sqlite3ThreadData() call, as the ThreadData structure must have already - ** been allocated for pTsdro->useSharedData to be non-zero. + /* If the new Btree uses a sharable pBtShared, then link the new + ** Btree into the list of all sharable Btrees for the same connection. + ** The list is kept in ascending order by pBt address. */ - if( pTsdro->useSharedData && zFilename && !isMemdb ){ - pBt->pNext = pTsdro->pBtree; - sqlite3ThreadData()->pBtree = pBt; + if( p->sharable ){ + int i; + Btree *pSib; + for(i=0; i<db->nDb; i++){ + if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){ + while( pSib->pPrev ){ pSib = pSib->pPrev; } + if( p->pBt<pSib->pBt ){ + p->pNext = pSib; + p->pPrev = 0; + pSib->pPrev = p; + }else{ + while( pSib->pNext && pSib->pNext->pBt<p->pBt ){ + pSib = pSib->pNext; + } + p->pNext = pSib->pNext; + p->pPrev = pSib; + if( p->pNext ){ + p->pNext->pPrev = p; + } + pSib->pNext = p; + } + break; + } + } } #endif - pBt->nRef = 1; *ppBtree = p; - return SQLITE_OK; + +btree_open_out: + if( rc!=SQLITE_OK ){ + if( pBt && pBt->pPager ){ + sqlite3PagerClose(pBt->pPager); + } + sqlite3_free(pBt); + sqlite3_free(p); + *ppBtree = 0; + } + return rc; +} + +/* +** Decrement the BtShared.nRef counter. When it reaches zero, +** remove the BtShared structure from the sharing list. Return +** true if the BtShared.nRef counter reaches zero and return +** false if it is still positive. +*/ +static int removeFromSharingList(BtShared *pBt){ +#ifndef SQLITE_OMIT_SHARED_CACHE + sqlite3_mutex *pMaster; + BtShared *pList; + int removed = 0; + + assert( sqlite3_mutex_notheld(pBt->mutex) ); + pMaster = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER); + sqlite3_mutex_enter(pMaster); + pBt->nRef--; + if( pBt->nRef<=0 ){ + if( sqlite3SharedCacheList==pBt ){ + sqlite3SharedCacheList = pBt->pNext; + }else{ + pList = sqlite3SharedCacheList; + while( pList && pList->pNext!=pBt ){ + pList=pList->pNext; + } + if( pList ){ + pList->pNext = pBt->pNext; + } + } + if( SQLITE_THREADSAFE ){ + sqlite3_mutex_free(pBt->mutex); + } + removed = 1; + } + sqlite3_mutex_leave(pMaster); + return removed; +#else + return 1; +#endif } /* @@ -1716,11 +1403,10 @@ int sqlite3BtreeClose(Btree *p){ BtShared *pBt = p->pBt; BtCursor *pCur; -#ifndef SQLITE_OMIT_SHARED_CACHE - ThreadData *pTsd; -#endif - /* Close all cursors opened via this handle. */ + assert( sqlite3_mutex_held(p->db->mutex) ); + sqlite3BtreeEnter(p); + pBt->db = p->db; pCur = pBt->pCursor; while( pCur ){ BtCursor *pTmp = pCur; @@ -1735,55 +1421,36 @@ int sqlite3BtreeClose(Btree *p){ ** this handle. */ sqlite3BtreeRollback(p); - sqliteFree(p); + sqlite3BtreeLeave(p); -#ifndef SQLITE_OMIT_SHARED_CACHE /* If there are still other outstanding references to the shared-btree ** structure, return now. The remainder of this procedure cleans ** up the shared-btree. */ - assert( pBt->nRef>0 ); - pBt->nRef--; - if( pBt->nRef ){ - return SQLITE_OK; - } - - /* Remove the shared-btree from the thread wide list. Call - ** ThreadDataReadOnly() and then cast away the const property of the - ** pointer to avoid allocating thread data if it is not really required. - */ - pTsd = (ThreadData *)sqlite3ThreadDataReadOnly(); - if( pTsd->pBtree==pBt ){ - assert( pTsd==sqlite3ThreadData() ); - pTsd->pBtree = pBt->pNext; - }else{ - BtShared *pPrev; - for(pPrev=pTsd->pBtree; pPrev && pPrev->pNext!=pBt; pPrev=pPrev->pNext){} - if( pPrev ){ - assert( pTsd==sqlite3ThreadData() ); - pPrev->pNext = pBt->pNext; + assert( p->wantToLock==0 && p->locked==0 ); + if( !p->sharable || removeFromSharingList(pBt) ){ + /* The pBt is no longer on the sharing list, so we can access + ** it without having to hold the mutex. + ** + ** Clean out and delete the BtShared object. + */ + assert( !pBt->pCursor ); + sqlite3PagerClose(pBt->pPager); + if( pBt->xFreeSchema && pBt->pSchema ){ + pBt->xFreeSchema(pBt->pSchema); } + sqlite3_free(pBt->pSchema); + sqlite3_free(pBt); } -#endif - /* Close the pager and free the shared-btree structure */ - assert( !pBt->pCursor ); - sqlite3pager_close(pBt->pPager); - if( pBt->xFreeSchema && pBt->pSchema ){ - pBt->xFreeSchema(pBt->pSchema); - } - sqliteFree(pBt->pSchema); - sqliteFree(pBt); - return SQLITE_OK; -} +#ifndef SQLITE_OMIT_SHARED_CACHE + assert( p->wantToLock==0 ); + assert( p->locked==0 ); + if( p->pPrev ) p->pPrev->pNext = p->pNext; + if( p->pNext ) p->pNext->pPrev = p->pPrev; +#endif -/* -** Change the busy handler callback function. -*/ -int sqlite3BtreeSetBusyHandler(Btree *p, BusyHandler *pHandler){ - BtShared *pBt = p->pBt; - pBt->pBusyHandler = pHandler; - sqlite3pager_set_busyhandler(pBt->pPager, pHandler); + sqlite3_free(p); return SQLITE_OK; } @@ -1804,7 +1471,10 @@ int sqlite3BtreeSetBusyHandler(Btree *p, BusyHandler *pHandler){ */ int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){ BtShared *pBt = p->pBt; - sqlite3pager_set_cachesize(pBt->pPager, mxPage); + assert( sqlite3_mutex_held(p->db->mutex) ); + sqlite3BtreeEnter(p); + sqlite3PagerSetCachesize(pBt->pPager, mxPage); + sqlite3BtreeLeave(p); return SQLITE_OK; } @@ -1819,7 +1489,10 @@ int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){ #ifndef SQLITE_OMIT_PAGER_PRAGMAS int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){ BtShared *pBt = p->pBt; - sqlite3pager_set_safety_level(pBt->pPager, level, fullSync); + assert( sqlite3_mutex_held(p->db->mutex) ); + sqlite3BtreeEnter(p); + sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync); + sqlite3BtreeLeave(p); return SQLITE_OK; } #endif @@ -1830,8 +1503,13 @@ int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){ */ int sqlite3BtreeSyncDisabled(Btree *p){ BtShared *pBt = p->pBt; + int rc; + assert( sqlite3_mutex_held(p->db->mutex) ); + sqlite3BtreeEnter(p); assert( pBt && pBt->pPager ); - return sqlite3pager_nosync(pBt->pPager); + rc = sqlite3PagerNosync(pBt->pPager); + sqlite3BtreeLeave(p); + return rc; } #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) @@ -1851,8 +1529,11 @@ int sqlite3BtreeSyncDisabled(Btree *p){ ** bytes per page is left unchanged. */ int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){ + int rc = SQLITE_OK; BtShared *pBt = p->pBt; + sqlite3BtreeEnter(p); if( pBt->pageSizeFixed ){ + sqlite3BtreeLeave(p); return SQLITE_READONLY; } if( nReserve<0 ){ @@ -1862,10 +1543,12 @@ int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){ ((pageSize-1)&pageSize)==0 ){ assert( (pageSize & 7)==0 ); assert( !pBt->pPage1 && !pBt->pCursor ); - pBt->pageSize = sqlite3pager_set_pagesize(pBt->pPager, pageSize); + pBt->pageSize = pageSize; + rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); } pBt->usableSize = pBt->pageSize - nReserve; - return SQLITE_OK; + sqlite3BtreeLeave(p); + return rc; } /* @@ -1875,7 +1558,24 @@ int sqlite3BtreeGetPageSize(Btree *p){ return p->pBt->pageSize; } int sqlite3BtreeGetReserve(Btree *p){ - return p->pBt->pageSize - p->pBt->usableSize; + int n; + sqlite3BtreeEnter(p); + n = p->pBt->pageSize - p->pBt->usableSize; + sqlite3BtreeLeave(p); + return n; +} + +/* +** Set the maximum page count for a database if mxPage is positive. +** No changes are made if mxPage is 0 or negative. +** Regardless of the value of mxPage, return the maximum page count. +*/ +int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){ + int n; + sqlite3BtreeEnter(p); + n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage); + sqlite3BtreeLeave(p); + return n; } #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */ @@ -1886,15 +1586,21 @@ int sqlite3BtreeGetReserve(Btree *p){ ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro. */ int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ - BtShared *pBt = p->pBt;; #ifdef SQLITE_OMIT_AUTOVACUUM return SQLITE_READONLY; #else - if( pBt->pageSizeFixed ){ - return SQLITE_READONLY; + BtShared *pBt = p->pBt; + int rc = SQLITE_OK; + int av = (autoVacuum?1:0); + + sqlite3BtreeEnter(p); + if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){ + rc = SQLITE_READONLY; + }else{ + pBt->autoVacuum = av; } - pBt->autoVacuum = (autoVacuum?1:0); - return SQLITE_OK; + sqlite3BtreeLeave(p); + return rc; #endif } @@ -1904,9 +1610,17 @@ int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ */ int sqlite3BtreeGetAutoVacuum(Btree *p){ #ifdef SQLITE_OMIT_AUTOVACUUM - return 0; + return BTREE_AUTOVACUUM_NONE; #else - return p->pBt->autoVacuum; + int rc; + sqlite3BtreeEnter(p); + rc = ( + (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE: + (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL: + BTREE_AUTOVACUUM_INCR + ); + sqlite3BtreeLeave(p); + return rc; #endif } @@ -1918,14 +1632,15 @@ int sqlite3BtreeGetAutoVacuum(Btree *p){ ** SQLITE_OK is returned on success. If the file is not a ** well-formed database file, then SQLITE_CORRUPT is returned. ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM -** is returned if we run out of memory. SQLITE_PROTOCOL is returned -** if there is a locking protocol violation. +** is returned if we run out of memory. */ static int lockBtree(BtShared *pBt){ int rc, pageSize; MemPage *pPage1; + + assert( sqlite3_mutex_held(pBt->mutex) ); if( pBt->pPage1 ) return SQLITE_OK; - rc = getPage(pBt, 1, &pPage1); + rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0); if( rc!=SQLITE_OK ) return rc; @@ -1933,16 +1648,21 @@ static int lockBtree(BtShared *pBt){ ** a valid database file. */ rc = SQLITE_NOTADB; - if( sqlite3pager_pagecount(pBt->pPager)>0 ){ + if( sqlite3PagerPagecount(pBt->pPager)>0 ){ u8 *page1 = pPage1->aData; if( memcmp(page1, zMagicHeader, 16)!=0 ){ goto page1_init_failed; } - if( page1[18]>1 || page1[19]>1 ){ + if( page1[18]>1 ){ + pBt->readOnly = 1; + } + if( page1[19]>1 ){ goto page1_init_failed; } pageSize = get2byte(&page1[16]); - if( ((pageSize-1)&pageSize)!=0 ){ + if( ((pageSize-1)&pageSize)!=0 || pageSize<512 || + (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE) + ){ goto page1_init_failed; } assert( (pageSize & 7)==0 ); @@ -1956,6 +1676,7 @@ static int lockBtree(BtShared *pBt){ pBt->minLeafFrac = page1[23]; #ifndef SQLITE_OMIT_AUTOVACUUM pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0); + pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0); #endif } @@ -1995,6 +1716,8 @@ page1_init_failed: */ static int lockBtreeWithRetry(Btree *pRef){ int rc = SQLITE_OK; + + assert( sqlite3BtreeHoldsMutex(pRef) ); if( pRef->inTrans==TRANS_NONE ){ u8 inTransaction = pRef->pBt->inTransaction; btreeIntegrity(pRef); @@ -2021,14 +1744,20 @@ static int lockBtreeWithRetry(Btree *pRef){ ** If there is a transaction in progress, this routine is a no-op. */ static void unlockBtreeIfUnused(BtShared *pBt){ + assert( sqlite3_mutex_held(pBt->mutex) ); if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){ - if( pBt->pPage1->aData==0 ){ - MemPage *pPage = pBt->pPage1; - pPage->aData = &((u8*)pPage)[-pBt->pageSize]; - pPage->pBt = pBt; - pPage->pgno = 1; + if( sqlite3PagerRefcount(pBt->pPager)>=1 ){ + assert( pBt->pPage1->aData ); +#if 0 + if( pBt->pPage1->aData==0 ){ + MemPage *pPage = pBt->pPage1; + pPage->aData = sqlite3PagerGetData(pPage->pDbPage); + pPage->pBt = pBt; + pPage->pgno = 1; + } +#endif + releasePage(pBt->pPage1); } - releasePage(pBt->pPage1); pBt->pPage1 = 0; pBt->inStmt = 0; } @@ -2042,11 +1771,13 @@ static int newDatabase(BtShared *pBt){ MemPage *pP1; unsigned char *data; int rc; - if( sqlite3pager_pagecount(pBt->pPager)>0 ) return SQLITE_OK; + + assert( sqlite3_mutex_held(pBt->mutex) ); + if( sqlite3PagerPagecount(pBt->pPager)>0 ) return SQLITE_OK; pP1 = pBt->pPage1; assert( pP1!=0 ); data = pP1->aData; - rc = sqlite3pager_write(data); + rc = sqlite3PagerWrite(pP1->pDbPage); if( rc ) return rc; memcpy(data, zMagicHeader, sizeof(zMagicHeader)); assert( sizeof(zMagicHeader)==16 ); @@ -2061,9 +1792,10 @@ static int newDatabase(BtShared *pBt){ zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA ); pBt->pageSizeFixed = 1; #ifndef SQLITE_OMIT_AUTOVACUUM - if( pBt->autoVacuum ){ - put4byte(&data[36 + 4*4], 1); - } + assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 ); + assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 ); + put4byte(&data[36 + 4*4], pBt->autoVacuum); + put4byte(&data[36 + 7*4], pBt->incrVacuum); #endif return SQLITE_OK; } @@ -2107,6 +1839,8 @@ int sqlite3BtreeBeginTrans(Btree *p, int wrflag){ BtShared *pBt = p->pBt; int rc = SQLITE_OK; + sqlite3BtreeEnter(p); + pBt->db = p->db; btreeIntegrity(p); /* If the btree is already in a write-transaction, or it @@ -2114,12 +1848,13 @@ int sqlite3BtreeBeginTrans(Btree *p, int wrflag){ ** is requested, this is a no-op. */ if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){ - return SQLITE_OK; + goto trans_begun; } /* Write transactions are not possible on a read-only database */ if( pBt->readOnly && wrflag ){ - return SQLITE_READONLY; + rc = SQLITE_READONLY; + goto trans_begun; } /* If another database handle has already opened a write transaction @@ -2127,18 +1862,35 @@ int sqlite3BtreeBeginTrans(Btree *p, int wrflag){ ** requested, return SQLITE_BUSY. */ if( pBt->inTransaction==TRANS_WRITE && wrflag ){ - return SQLITE_BUSY; + rc = SQLITE_BUSY; + goto trans_begun; + } + +#ifndef SQLITE_OMIT_SHARED_CACHE + if( wrflag>1 ){ + BtLock *pIter; + for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ + if( pIter->pBtree!=p ){ + rc = SQLITE_BUSY; + goto trans_begun; + } + } } +#endif do { if( pBt->pPage1==0 ){ rc = lockBtree(pBt); } - + if( rc==SQLITE_OK && wrflag ){ - rc = sqlite3pager_begin(pBt->pPage1->aData, wrflag>1); - if( rc==SQLITE_OK ){ - rc = newDatabase(pBt); + if( pBt->readOnly ){ + rc = SQLITE_READONLY; + }else{ + rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1); + if( rc==SQLITE_OK ){ + rc = newDatabase(pBt); + } } } @@ -2148,7 +1900,7 @@ int sqlite3BtreeBeginTrans(Btree *p, int wrflag){ unlockBtreeIfUnused(pBt); } }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE && - sqlite3InvokeBusyHandler(pBt->pBusyHandler) ); + sqlite3BtreeInvokeBusyHandler(pBt, 0) ); if( rc==SQLITE_OK ){ if( p->inTrans==TRANS_NONE ){ @@ -2158,9 +1910,18 @@ int sqlite3BtreeBeginTrans(Btree *p, int wrflag){ if( p->inTrans>pBt->inTransaction ){ pBt->inTransaction = p->inTrans; } +#ifndef SQLITE_OMIT_SHARED_CACHE + if( wrflag>1 ){ + assert( !pBt->pExclusive ); + pBt->pExclusive = p; + } +#endif } + +trans_begun: btreeIntegrity(p); + sqlite3BtreeLeave(p); return rc; } @@ -2174,12 +1935,16 @@ int sqlite3BtreeBeginTrans(Btree *p, int wrflag){ static int setChildPtrmaps(MemPage *pPage){ int i; /* Counter variable */ int nCell; /* Number of cells in page pPage */ - int rc = SQLITE_OK; /* Return code */ + int rc; /* Return code */ BtShared *pBt = pPage->pBt; int isInitOrig = pPage->isInit; Pgno pgno = pPage->pgno; - initPage(pPage, 0); + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); + rc = sqlite3BtreeInitPage(pPage, pPage->pParent); + if( rc!=SQLITE_OK ){ + goto set_child_ptrmaps_out; + } nCell = pPage->nCell; for(i=0; i<nCell; i++){ @@ -2223,6 +1988,7 @@ set_child_ptrmaps_out: ** overflow page in the list. */ static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); if( eType==PTRMAP_OVERFLOW2 ){ /* The pointer is always the first 4 bytes of the page in this case. */ if( get4byte(pPage->aData)!=iFrom ){ @@ -2234,14 +2000,14 @@ static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ int i; int nCell; - initPage(pPage, 0); + sqlite3BtreeInitPage(pPage, 0); nCell = pPage->nCell; for(i=0; i<nCell; i++){ u8 *pCell = findCell(pPage, i); if( eType==PTRMAP_OVERFLOW1 ){ CellInfo info; - parseCellPtr(pPage, pCell, &info); + sqlite3BtreeParseCellPtr(pPage, pCell, &info); if( info.iOverflow ){ if( iFrom==get4byte(&pCell[info.iOverflow]) ){ put4byte(&pCell[info.iOverflow], iTo); @@ -2288,11 +2054,13 @@ static int relocatePage( assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ); + assert( sqlite3_mutex_held(pBt->mutex) ); + assert( pDbPage->pBt==pBt ); - /* Move page iDbPage from it's current location to page number iFreePage */ + /* Move page iDbPage from its current location to page number iFreePage */ TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", iDbPage, iFreePage, iPtrPage, eType)); - rc = sqlite3pager_movepage(pPager, pDbPage->aData, iFreePage); + rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage); if( rc!=SQLITE_OK ){ return rc; } @@ -2326,11 +2094,11 @@ static int relocatePage( ** iPtrPage. */ if( eType!=PTRMAP_ROOTPAGE ){ - rc = getPage(pBt, iPtrPage, &pPtrPage); + rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0); if( rc!=SQLITE_OK ){ return rc; } - rc = sqlite3pager_write(pPtrPage->aData); + rc = sqlite3PagerWrite(pPtrPage->pDbPage); if( rc!=SQLITE_OK ){ releasePage(pPtrPage); return rc; @@ -2344,154 +2112,278 @@ static int relocatePage( return rc; } -/* Forward declaration required by autoVacuumCommit(). */ -static int allocatePage(BtShared *, MemPage **, Pgno *, Pgno, u8); +/* Forward declaration required by incrVacuumStep(). */ +static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); /* -** This routine is called prior to sqlite3pager_commit when a transaction -** is commited for an auto-vacuum database. +** Perform a single step of an incremental-vacuum. If successful, +** return SQLITE_OK. If there is no work to do (and therefore no +** point in calling this function again), return SQLITE_DONE. +** +** More specificly, this function attempts to re-organize the +** database so that the last page of the file currently in use +** is no longer in use. +** +** If the nFin parameter is non-zero, the implementation assumes +** that the caller will keep calling incrVacuumStep() until +** it returns SQLITE_DONE or an error, and that nFin is the +** number of pages the database file will contain after this +** process is complete. */ -static int autoVacuumCommit(BtShared *pBt, Pgno *nTrunc){ - Pager *pPager = pBt->pPager; - Pgno nFreeList; /* Number of pages remaining on the free-list. */ - int nPtrMap; /* Number of pointer-map pages deallocated */ - Pgno origSize; /* Pages in the database file */ - Pgno finSize; /* Pages in the database file after truncation */ - int rc; /* Return code */ - u8 eType; - int pgsz = pBt->pageSize; /* Page size for this database */ - Pgno iDbPage; /* The database page to move */ - MemPage *pDbMemPage = 0; /* "" */ - Pgno iPtrPage; /* The page that contains a pointer to iDbPage */ - Pgno iFreePage; /* The free-list page to move iDbPage to */ - MemPage *pFreeMemPage = 0; /* "" */ - -#ifndef NDEBUG - int nRef = sqlite3pager_refcount(pPager); -#endif +static int incrVacuumStep(BtShared *pBt, Pgno nFin){ + Pgno iLastPg; /* Last page in the database */ + Pgno nFreeList; /* Number of pages still on the free-list */ - assert( pBt->autoVacuum ); - if( PTRMAP_ISPAGE(pBt, sqlite3pager_pagecount(pPager)) ){ - return SQLITE_CORRUPT_BKPT; + assert( sqlite3_mutex_held(pBt->mutex) ); + iLastPg = pBt->nTrunc; + if( iLastPg==0 ){ + iLastPg = sqlite3PagerPagecount(pBt->pPager); } - /* Figure out how many free-pages are in the database. If there are no - ** free pages, then auto-vacuum is a no-op. - */ - nFreeList = get4byte(&pBt->pPage1->aData[36]); - if( nFreeList==0 ){ - *nTrunc = 0; - return SQLITE_OK; - } + if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){ + int rc; + u8 eType; + Pgno iPtrPage; - /* This block figures out how many pages there are in the database - ** now (variable origSize), and how many there will be after the - ** truncation (variable finSize). - ** - ** The final size is the original size, less the number of free pages - ** in the database, less any pointer-map pages that will no longer - ** be required, less 1 if the pending-byte page was part of the database - ** but is not after the truncation. - **/ - origSize = sqlite3pager_pagecount(pPager); - if( origSize==PENDING_BYTE_PAGE(pBt) ){ - origSize--; - } - nPtrMap = (nFreeList-origSize+PTRMAP_PAGENO(pBt, origSize)+pgsz/5)/(pgsz/5); - finSize = origSize - nFreeList - nPtrMap; - if( origSize>PENDING_BYTE_PAGE(pBt) && finSize<=PENDING_BYTE_PAGE(pBt) ){ - finSize--; - } - while( PTRMAP_ISPAGE(pBt, finSize) || finSize==PENDING_BYTE_PAGE(pBt) ){ - finSize--; - } - TRACE(("AUTOVACUUM: Begin (db size %d->%d)\n", origSize, finSize)); - - /* Variable 'finSize' will be the size of the file in pages after - ** the auto-vacuum has completed (the current file size minus the number - ** of pages on the free list). Loop through the pages that lie beyond - ** this mark, and if they are not already on the free list, move them - ** to a free page earlier in the file (somewhere before finSize). - */ - for( iDbPage=finSize+1; iDbPage<=origSize; iDbPage++ ){ - /* If iDbPage is a pointer map page, or the pending-byte page, skip it. */ - if( PTRMAP_ISPAGE(pBt, iDbPage) || iDbPage==PENDING_BYTE_PAGE(pBt) ){ - continue; + nFreeList = get4byte(&pBt->pPage1->aData[36]); + if( nFreeList==0 || nFin==iLastPg ){ + return SQLITE_DONE; } - rc = ptrmapGet(pBt, iDbPage, &eType, &iPtrPage); - if( rc!=SQLITE_OK ) goto autovacuum_out; + rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage); + if( rc!=SQLITE_OK ){ + return rc; + } if( eType==PTRMAP_ROOTPAGE ){ - rc = SQLITE_CORRUPT_BKPT; - goto autovacuum_out; + return SQLITE_CORRUPT_BKPT; } - /* If iDbPage is free, do not swap it. */ if( eType==PTRMAP_FREEPAGE ){ - continue; - } - rc = getPage(pBt, iDbPage, &pDbMemPage); - if( rc!=SQLITE_OK ) goto autovacuum_out; + if( nFin==0 ){ + /* Remove the page from the files free-list. This is not required + ** if nFin is non-zero. In that case, the free-list will be + ** truncated to zero after this function returns, so it doesn't + ** matter if it still contains some garbage entries. + */ + Pgno iFreePg; + MemPage *pFreePg; + rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1); + if( rc!=SQLITE_OK ){ + return rc; + } + assert( iFreePg==iLastPg ); + releasePage(pFreePg); + } + } else { + Pgno iFreePg; /* Index of free page to move pLastPg to */ + MemPage *pLastPg; - /* Find the next page in the free-list that is not already at the end - ** of the file. A page can be pulled off the free list using the - ** allocatePage() routine. - */ - do{ - if( pFreeMemPage ){ - releasePage(pFreeMemPage); - pFreeMemPage = 0; + rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0); + if( rc!=SQLITE_OK ){ + return rc; + } + + /* If nFin is zero, this loop runs exactly once and page pLastPg + ** is swapped with the first free page pulled off the free list. + ** + ** On the other hand, if nFin is greater than zero, then keep + ** looping until a free-page located within the first nFin pages + ** of the file is found. + */ + do { + MemPage *pFreePg; + rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0); + if( rc!=SQLITE_OK ){ + releasePage(pLastPg); + return rc; + } + releasePage(pFreePg); + }while( nFin!=0 && iFreePg>nFin ); + assert( iFreePg<iLastPg ); + + rc = sqlite3PagerWrite(pLastPg->pDbPage); + if( rc==SQLITE_OK ){ + rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg); } - rc = allocatePage(pBt, &pFreeMemPage, &iFreePage, 0, 0); + releasePage(pLastPg); if( rc!=SQLITE_OK ){ - releasePage(pDbMemPage); - goto autovacuum_out; + return rc; } - assert( iFreePage<=origSize ); - }while( iFreePage>finSize ); - releasePage(pFreeMemPage); - pFreeMemPage = 0; - - /* Relocate the page into the body of the file. Note that although the - ** page has moved within the database file, the pDbMemPage pointer - ** remains valid. This means that this function can run without - ** invalidating cursors open on the btree. This is important in - ** shared-cache mode. - */ - rc = relocatePage(pBt, pDbMemPage, eType, iPtrPage, iFreePage); - releasePage(pDbMemPage); - if( rc!=SQLITE_OK ) goto autovacuum_out; + } } - /* The entire free-list has been swapped to the end of the file. So - ** truncate the database file to finSize pages and consider the - ** free-list empty. - */ - rc = sqlite3pager_write(pBt->pPage1->aData); - if( rc!=SQLITE_OK ) goto autovacuum_out; - put4byte(&pBt->pPage1->aData[32], 0); - put4byte(&pBt->pPage1->aData[36], 0); - *nTrunc = finSize; - assert( finSize!=PENDING_BYTE_PAGE(pBt) ); - -autovacuum_out: - assert( nRef==sqlite3pager_refcount(pPager) ); - if( rc!=SQLITE_OK ){ - sqlite3pager_rollback(pPager); + pBt->nTrunc = iLastPg - 1; + while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){ + pBt->nTrunc--; + } + return SQLITE_OK; +} + +/* +** A write-transaction must be opened before calling this function. +** It performs a single unit of work towards an incremental vacuum. +** +** If the incremental vacuum is finished after this function has run, +** SQLITE_DONE is returned. If it is not finished, but no error occured, +** SQLITE_OK is returned. Otherwise an SQLite error code. +*/ +int sqlite3BtreeIncrVacuum(Btree *p){ + int rc; + BtShared *pBt = p->pBt; + + sqlite3BtreeEnter(p); + pBt->db = p->db; + assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE ); + if( !pBt->autoVacuum ){ + rc = SQLITE_DONE; + }else{ + invalidateAllOverflowCache(pBt); + rc = incrVacuumStep(pBt, 0); + } + sqlite3BtreeLeave(p); + return rc; +} + +/* +** This routine is called prior to sqlite3PagerCommit when a transaction +** is commited for an auto-vacuum database. +** +** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages +** the database file should be truncated to during the commit process. +** i.e. the database has been reorganized so that only the first *pnTrunc +** pages are in use. +*/ +static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){ + int rc = SQLITE_OK; + Pager *pPager = pBt->pPager; +#ifndef NDEBUG + int nRef = sqlite3PagerRefcount(pPager); +#endif + + assert( sqlite3_mutex_held(pBt->mutex) ); + invalidateAllOverflowCache(pBt); + assert(pBt->autoVacuum); + if( !pBt->incrVacuum ){ + Pgno nFin = 0; + + if( pBt->nTrunc==0 ){ + Pgno nFree; + Pgno nPtrmap; + const int pgsz = pBt->pageSize; + Pgno nOrig = sqlite3PagerPagecount(pBt->pPager); + + if( PTRMAP_ISPAGE(pBt, nOrig) ){ + return SQLITE_CORRUPT_BKPT; + } + if( nOrig==PENDING_BYTE_PAGE(pBt) ){ + nOrig--; + } + nFree = get4byte(&pBt->pPage1->aData[36]); + nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5); + nFin = nOrig - nFree - nPtrmap; + if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){ + nFin--; + } + while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){ + nFin--; + } + } + + while( rc==SQLITE_OK ){ + rc = incrVacuumStep(pBt, nFin); + } + if( rc==SQLITE_DONE ){ + assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc); + rc = SQLITE_OK; + if( pBt->nTrunc ){ + rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); + put4byte(&pBt->pPage1->aData[32], 0); + put4byte(&pBt->pPage1->aData[36], 0); + pBt->nTrunc = nFin; + } + } + if( rc!=SQLITE_OK ){ + sqlite3PagerRollback(pPager); + } + } + + if( rc==SQLITE_OK ){ + *pnTrunc = pBt->nTrunc; + pBt->nTrunc = 0; } + assert( nRef==sqlite3PagerRefcount(pPager) ); return rc; } + #endif /* +** This routine does the first phase of a two-phase commit. This routine +** causes a rollback journal to be created (if it does not already exist) +** and populated with enough information so that if a power loss occurs +** the database can be restored to its original state by playing back +** the journal. Then the contents of the journal are flushed out to +** the disk. After the journal is safely on oxide, the changes to the +** database are written into the database file and flushed to oxide. +** At the end of this call, the rollback journal still exists on the +** disk and we are still holding all locks, so the transaction has not +** committed. See sqlite3BtreeCommit() for the second phase of the +** commit process. +** +** This call is a no-op if no write-transaction is currently active on pBt. +** +** Otherwise, sync the database file for the btree pBt. zMaster points to +** the name of a master journal file that should be written into the +** individual journal file, or is NULL, indicating no master journal file +** (single database transaction). +** +** When this is called, the master journal should already have been +** created, populated with this journal pointer and synced to disk. +** +** Once this is routine has returned, the only thing required to commit +** the write-transaction for this database file is to delete the journal. +*/ +int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){ + int rc = SQLITE_OK; + if( p->inTrans==TRANS_WRITE ){ + BtShared *pBt = p->pBt; + Pgno nTrunc = 0; + sqlite3BtreeEnter(p); + pBt->db = p->db; +#ifndef SQLITE_OMIT_AUTOVACUUM + if( pBt->autoVacuum ){ + rc = autoVacuumCommit(pBt, &nTrunc); + if( rc!=SQLITE_OK ){ + sqlite3BtreeLeave(p); + return rc; + } + } +#endif + rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc); + sqlite3BtreeLeave(p); + } + return rc; +} + +/* ** Commit the transaction currently in progress. ** +** This routine implements the second phase of a 2-phase commit. The +** sqlite3BtreeSync() routine does the first phase and should be invoked +** prior to calling this routine. The sqlite3BtreeSync() routine did +** all the work of writing information out to disk and flushing the +** contents so that they are written onto the disk platter. All this +** routine has to do is delete or truncate the rollback journal +** (which causes the transaction to commit) and drop locks. +** ** This will release the write lock on the database file. If there ** are no active cursors, it also releases the read lock. */ -int sqlite3BtreeCommit(Btree *p){ +int sqlite3BtreeCommitPhaseTwo(Btree *p){ BtShared *pBt = p->pBt; + sqlite3BtreeEnter(p); + pBt->db = p->db; btreeIntegrity(p); /* If the handle has a write-transaction open, commit the shared-btrees @@ -2501,8 +2393,9 @@ int sqlite3BtreeCommit(Btree *p){ int rc; assert( pBt->inTransaction==TRANS_WRITE ); assert( pBt->nTransaction>0 ); - rc = sqlite3pager_commit(pBt->pPager); + rc = sqlite3PagerCommitPhaseTwo(pBt->pPager); if( rc!=SQLITE_OK ){ + sqlite3BtreeLeave(p); return rc; } pBt->inTransaction = TRANS_READ; @@ -2529,43 +2422,72 @@ int sqlite3BtreeCommit(Btree *p){ unlockBtreeIfUnused(pBt); btreeIntegrity(p); + sqlite3BtreeLeave(p); return SQLITE_OK; } +/* +** Do both phases of a commit. +*/ +int sqlite3BtreeCommit(Btree *p){ + int rc; + sqlite3BtreeEnter(p); + rc = sqlite3BtreeCommitPhaseOne(p, 0); + if( rc==SQLITE_OK ){ + rc = sqlite3BtreeCommitPhaseTwo(p); + } + sqlite3BtreeLeave(p); + return rc; +} + #ifndef NDEBUG /* ** Return the number of write-cursors open on this handle. This is for use ** in assert() expressions, so it is only compiled if NDEBUG is not ** defined. +** +** For the purposes of this routine, a write-cursor is any cursor that +** is capable of writing to the databse. That means the cursor was +** originally opened for writing and the cursor has not be disabled +** by having its state changed to CURSOR_FAULT. */ static int countWriteCursors(BtShared *pBt){ BtCursor *pCur; int r = 0; for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ - if( pCur->wrFlag ) r++; + if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++; } return r; } #endif -#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) /* -** Print debugging information about all cursors to standard output. +** This routine sets the state to CURSOR_FAULT and the error +** code to errCode for every cursor on BtShared that pBtree +** references. +** +** Every cursor is tripped, including cursors that belong +** to other database connections that happen to be sharing +** the cache with pBtree. +** +** This routine gets called when a rollback occurs. +** All cursors using the same cache must be tripped +** to prevent them from trying to use the btree after +** the rollback. The rollback may have deleted tables +** or moved root pages, so it is not sufficient to +** save the state of the cursor. The cursor must be +** invalidated. */ -void sqlite3BtreeCursorList(Btree *p){ - BtCursor *pCur; - BtShared *pBt = p->pBt; - for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ - MemPage *pPage = pCur->pPage; - char *zMode = pCur->wrFlag ? "rw" : "ro"; - sqlite3DebugPrintf("CURSOR %p rooted at %4d(%s) currently at %d.%d%s\n", - pCur, pCur->pgnoRoot, zMode, - pPage ? pPage->pgno : 0, pCur->idx, - (pCur->eState==CURSOR_VALID) ? "" : " eof" - ); +void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){ + BtCursor *p; + sqlite3BtreeEnter(pBtree); + for(p=pBtree->pBt->pCursor; p; p=p->pNext){ + clearCursorPosition(p); + p->eState = CURSOR_FAULT; + p->skip = errCode; } + sqlite3BtreeLeave(pBtree); } -#endif /* ** Rollback the transaction in progress. All cursors will be @@ -2581,6 +2503,8 @@ int sqlite3BtreeRollback(Btree *p){ BtShared *pBt = p->pBt; MemPage *pPage1; + sqlite3BtreeEnter(p); + pBt->db = p->db; rc = saveAllCursors(pBt, 0, 0); #ifndef SQLITE_OMIT_SHARED_CACHE if( rc!=SQLITE_OK ){ @@ -2591,12 +2515,7 @@ int sqlite3BtreeRollback(Btree *p){ ** we cannot simply return the error to the caller. Instead, abort ** all queries that may be using any of the cursors that failed to save. */ - while( pBt->pCursor ){ - sqlite3 *db = pBt->pCursor->pBtree->pSqlite; - if( db ){ - sqlite3AbortOtherActiveVdbes(db, 0); - } - } + sqlite3BtreeTripAllCursors(p, rc); } #endif btreeIntegrity(p); @@ -2605,16 +2524,20 @@ int sqlite3BtreeRollback(Btree *p){ if( p->inTrans==TRANS_WRITE ){ int rc2; +#ifndef SQLITE_OMIT_AUTOVACUUM + pBt->nTrunc = 0; +#endif + assert( TRANS_WRITE==pBt->inTransaction ); - rc2 = sqlite3pager_rollback(pBt->pPager); + rc2 = sqlite3PagerRollback(pBt->pPager); if( rc2!=SQLITE_OK ){ rc = rc2; } /* The rollback may have destroyed the pPage1->aData value. So - ** call getPage() on page 1 again to make sure pPage1->aData is - ** set correctly. */ - if( getPage(pBt, 1, &pPage1)==SQLITE_OK ){ + ** call sqlite3BtreeGetPage() on page 1 again to make + ** sure pPage1->aData is set correctly. */ + if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){ releasePage(pPage1); } assert( countWriteCursors(pBt)==0 ); @@ -2634,6 +2557,7 @@ int sqlite3BtreeRollback(Btree *p){ unlockBtreeIfUnused(pBt); btreeIntegrity(p); + sqlite3BtreeLeave(p); return rc; } @@ -2655,12 +2579,16 @@ int sqlite3BtreeRollback(Btree *p){ int sqlite3BtreeBeginStmt(Btree *p){ int rc; BtShared *pBt = p->pBt; + sqlite3BtreeEnter(p); + pBt->db = p->db; if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){ - return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; + }else{ + assert( pBt->inTransaction==TRANS_WRITE ); + rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager); + pBt->inStmt = 1; } - assert( pBt->inTransaction==TRANS_WRITE ); - rc = pBt->readOnly ? SQLITE_OK : sqlite3pager_stmt_begin(pBt->pPager); - pBt->inStmt = 1; + sqlite3BtreeLeave(p); return rc; } @@ -2672,12 +2600,15 @@ int sqlite3BtreeBeginStmt(Btree *p){ int sqlite3BtreeCommitStmt(Btree *p){ int rc; BtShared *pBt = p->pBt; + sqlite3BtreeEnter(p); + pBt->db = p->db; if( pBt->inStmt && !pBt->readOnly ){ - rc = sqlite3pager_stmt_commit(pBt->pPager); + rc = sqlite3PagerStmtCommit(pBt->pPager); }else{ rc = SQLITE_OK; } pBt->inStmt = 0; + sqlite3BtreeLeave(p); return rc; } @@ -2692,13 +2623,14 @@ int sqlite3BtreeCommitStmt(Btree *p){ int sqlite3BtreeRollbackStmt(Btree *p){ int rc = SQLITE_OK; BtShared *pBt = p->pBt; - sqlite3MallocDisallow(); + sqlite3BtreeEnter(p); + pBt->db = p->db; if( pBt->inStmt && !pBt->readOnly ){ - rc = sqlite3pager_stmt_rollback(pBt->pPager); + rc = sqlite3PagerStmtRollback(pBt->pPager); assert( countWriteCursors(pBt)==0 ); pBt->inStmt = 0; } - sqlite3MallocAllow(); + sqlite3BtreeLeave(p); return rc; } @@ -2732,25 +2664,16 @@ static int dfltCompare( ** ** 1: The cursor must have been opened with wrFlag==1 ** -** 2: No other cursors may be open with wrFlag==0 on the same table +** 2: Other database connections that share the same pager cache +** but which are not in the READ_UNCOMMITTED state may not have +** cursors open with wrFlag==0 on the same table. Otherwise +** the changes made by this write cursor would be visible to +** the read cursors in the other database connection. ** ** 3: The database must be writable (not on read-only media) ** ** 4: There must be an active transaction. ** -** Condition 2 warrants further discussion. If any cursor is opened -** on a table with wrFlag==0, that prevents all other cursors from -** writing to that table. This is a kind of "read-lock". When a cursor -** is opened with wrFlag==0 it is guaranteed that the table will not -** change as long as the cursor is open. This allows the cursor to -** do a sequential scan of the table without having to worry about -** entries being inserted or deleted during the scan. Cursors should -** be opened with wrFlag==0 only if this read-lock property is needed. -** That is to say, cursors should be opened with wrFlag==0 only if they -** intend to use the sqlite3BtreeNext() system call. All other cursors -** should be opened with wrFlag==1 even if they never really intend -** to write. -** ** No checking is done to make sure that page iTable really is the ** root page of a b-tree. If it is not, then the cursor acquired ** will not work correctly. @@ -2761,7 +2684,7 @@ static int dfltCompare( ** default comparison function is used. The comparison function is ** always ignored for INTKEY tables. */ -int sqlite3BtreeCursor( +static int btreeCursor( Btree *p, /* The btree */ int iTable, /* Root page of table to open */ int wrFlag, /* 1 to write. 0 read-only */ @@ -2773,12 +2696,13 @@ int sqlite3BtreeCursor( BtCursor *pCur; BtShared *pBt = p->pBt; + assert( sqlite3BtreeHoldsMutex(p) ); *ppCur = 0; if( wrFlag ){ if( pBt->readOnly ){ return SQLITE_READONLY; } - if( checkReadLocks(pBt, iTable, 0) ){ + if( checkReadLocks(p, iTable, 0) ){ return SQLITE_LOCKED; } } @@ -2788,14 +2712,17 @@ int sqlite3BtreeCursor( if( rc!=SQLITE_OK ){ return rc; } + if( pBt->readOnly && wrFlag ){ + return SQLITE_READONLY; + } } - pCur = sqliteMalloc( sizeof(*pCur) ); + pCur = sqlite3MallocZero( sizeof(*pCur) ); if( pCur==0 ){ rc = SQLITE_NOMEM; goto create_cursor_exception; } pCur->pgnoRoot = (Pgno)iTable; - if( iTable==1 && sqlite3pager_pagecount(pBt->pPager)==0 ){ + if( iTable==1 && sqlite3PagerPagecount(pBt->pPager)==0 ){ rc = SQLITE_EMPTY; goto create_cursor_exception; } @@ -2811,6 +2738,7 @@ int sqlite3BtreeCursor( pCur->xCompare = xCmp ? xCmp : dfltCompare; pCur->pArg = pArg; pCur->pBtree = p; + pCur->pBt = pBt; pCur->wrFlag = wrFlag; pCur->pNext = pBt->pCursor; if( pCur->pNext ){ @@ -2821,36 +2749,43 @@ int sqlite3BtreeCursor( *ppCur = pCur; return SQLITE_OK; + create_cursor_exception: if( pCur ){ releasePage(pCur->pPage); - sqliteFree(pCur); + sqlite3_free(pCur); } unlockBtreeIfUnused(pBt); return rc; } - -#if 0 /* Not Used */ -/* -** Change the value of the comparison function used by a cursor. -*/ -void sqlite3BtreeSetCompare( - BtCursor *pCur, /* The cursor to whose comparison function is changed */ - int(*xCmp)(void*,int,const void*,int,const void*), /* New comparison func */ - void *pArg /* First argument to xCmp() */ +int sqlite3BtreeCursor( + Btree *p, /* The btree */ + int iTable, /* Root page of table to open */ + int wrFlag, /* 1 to write. 0 read-only */ + int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */ + void *pArg, /* First arg to xCompare() */ + BtCursor **ppCur /* Write new cursor here */ ){ - pCur->xCompare = xCmp ? xCmp : dfltCompare; - pCur->pArg = pArg; + int rc; + sqlite3BtreeEnter(p); + p->pBt->db = p->db; + rc = btreeCursor(p, iTable, wrFlag, xCmp, pArg, ppCur); + sqlite3BtreeLeave(p); + return rc; } -#endif + /* ** Close a cursor. The read lock on the database file is released ** when the last cursor is closed. */ int sqlite3BtreeCloseCursor(BtCursor *pCur){ - BtShared *pBt = pCur->pBtree->pBt; - restoreOrClearCursorPosition(pCur, 0); + BtShared *pBt = pCur->pBt; + Btree *pBtree = pCur->pBtree; + + sqlite3BtreeEnter(pBtree); + pBt->db = pBtree->db; + clearCursorPosition(pCur); if( pCur->pPrev ){ pCur->pPrev->pNext = pCur->pNext; }else{ @@ -2861,7 +2796,9 @@ int sqlite3BtreeCloseCursor(BtCursor *pCur){ } releasePage(pCur->pPage); unlockBtreeIfUnused(pBt); - sqliteFree(pCur); + invalidateOverflowCache(pCur); + sqlite3_free(pCur); + sqlite3BtreeLeave(pBtree); return SQLITE_OK; } @@ -2869,12 +2806,13 @@ int sqlite3BtreeCloseCursor(BtCursor *pCur){ ** Make a temporary cursor by filling in the fields of pTempCur. ** The temporary cursor is not on the cursor list for the Btree. */ -static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){ +void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){ + assert( cursorHoldsMutex(pCur) ); memcpy(pTempCur, pCur, sizeof(*pCur)); pTempCur->pNext = 0; pTempCur->pPrev = 0; if( pTempCur->pPage ){ - sqlite3pager_ref(pTempCur->pPage->aData); + sqlite3PagerRef(pTempCur->pPage->pDbPage); } } @@ -2882,31 +2820,56 @@ static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){ ** Delete a temporary cursor such as was made by the CreateTemporaryCursor() ** function above. */ -static void releaseTempCursor(BtCursor *pCur){ +void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){ + assert( cursorHoldsMutex(pCur) ); if( pCur->pPage ){ - sqlite3pager_unref(pCur->pPage->aData); + sqlite3PagerUnref(pCur->pPage->pDbPage); } } /* -** Make sure the BtCursor.info field of the given cursor is valid. -** If it is not already valid, call parseCell() to fill it in. +** Make sure the BtCursor* given in the argument has a valid +** BtCursor.info structure. If it is not already valid, call +** sqlite3BtreeParseCell() to fill it in. ** ** BtCursor.info is a cache of the information in the current cell. -** Using this cache reduces the number of calls to parseCell(). +** Using this cache reduces the number of calls to sqlite3BtreeParseCell(). +** +** 2007-06-25: There is a bug in some versions of MSVC that cause the +** compiler to crash when getCellInfo() is implemented as a macro. +** But there is a measureable speed advantage to using the macro on gcc +** (when less compiler optimizations like -Os or -O0 are used and the +** compiler is not doing agressive inlining.) So we use a real function +** for MSVC and a macro for everything else. Ticket #2457. */ -static void getCellInfo(BtCursor *pCur){ - if( pCur->info.nSize==0 ){ - parseCell(pCur->pPage, pCur->idx, &pCur->info); - }else{ #ifndef NDEBUG + static void assertCellInfo(BtCursor *pCur){ CellInfo info; memset(&info, 0, sizeof(info)); - parseCell(pCur->pPage, pCur->idx, &info); + sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &info); assert( memcmp(&info, &pCur->info, sizeof(info))==0 ); + } +#else + #define assertCellInfo(x) #endif +#ifdef _MSC_VER + /* Use a real function in MSVC to work around bugs in that compiler. */ + static void getCellInfo(BtCursor *pCur){ + if( pCur->info.nSize==0 ){ + sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info); + }else{ + assertCellInfo(pCur); + } } -} +#else /* if not _MSC_VER */ + /* Use a macro in all other compilers so that the function is inlined */ +#define getCellInfo(pCur) \ + if( pCur->info.nSize==0 ){ \ + sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info); \ + }else{ \ + assertCellInfo(pCur); \ + } +#endif /* _MSC_VER */ /* ** Set *pSize to the size of the buffer needed to hold the value of @@ -2917,7 +2880,10 @@ static void getCellInfo(BtCursor *pCur){ ** itself, not the number of bytes in the key. */ int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){ - int rc = restoreOrClearCursorPosition(pCur, 1); + int rc; + + assert( cursorHoldsMutex(pCur) ); + rc = restoreOrClearCursorPosition(pCur); if( rc==SQLITE_OK ){ assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); if( pCur->eState==CURSOR_INVALID ){ @@ -2938,7 +2904,10 @@ int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){ ** the database is empty) then *pSize is set to 0. */ int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){ - int rc = restoreOrClearCursorPosition(pCur, 1); + int rc; + + assert( cursorHoldsMutex(pCur) ); + rc = restoreOrClearCursorPosition(pCur); if( rc==SQLITE_OK ){ assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); if( pCur->eState==CURSOR_INVALID ){ @@ -2953,93 +2922,284 @@ int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){ } /* -** Read payload information from the entry that the pCur cursor is -** pointing to. Begin reading the payload at "offset" and read -** a total of "amt" bytes. Put the result in zBuf. +** Given the page number of an overflow page in the database (parameter +** ovfl), this function finds the page number of the next page in the +** linked list of overflow pages. If possible, it uses the auto-vacuum +** pointer-map data instead of reading the content of page ovfl to do so. ** -** This routine does not make a distinction between key and data. -** It just reads bytes from the payload area. Data might appear -** on the main page or be scattered out on multiple overflow pages. +** If an error occurs an SQLite error code is returned. Otherwise: +** +** Unless pPgnoNext is NULL, the page number of the next overflow +** page in the linked list is written to *pPgnoNext. If page ovfl +** is the last page in its linked list, *pPgnoNext is set to zero. +** +** If ppPage is not NULL, *ppPage is set to the MemPage* handle +** for page ovfl. The underlying pager page may have been requested +** with the noContent flag set, so the page data accessable via +** this handle may not be trusted. */ -static int getPayload( +static int getOverflowPage( + BtShared *pBt, + Pgno ovfl, /* Overflow page */ + MemPage **ppPage, /* OUT: MemPage handle */ + Pgno *pPgnoNext /* OUT: Next overflow page number */ +){ + Pgno next = 0; + int rc; + + assert( sqlite3_mutex_held(pBt->mutex) ); + /* One of these must not be NULL. Otherwise, why call this function? */ + assert(ppPage || pPgnoNext); + + /* If pPgnoNext is NULL, then this function is being called to obtain + ** a MemPage* reference only. No page-data is required in this case. + */ + if( !pPgnoNext ){ + return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1); + } + +#ifndef SQLITE_OMIT_AUTOVACUUM + /* Try to find the next page in the overflow list using the + ** autovacuum pointer-map pages. Guess that the next page in + ** the overflow list is page number (ovfl+1). If that guess turns + ** out to be wrong, fall back to loading the data of page + ** number ovfl to determine the next page number. + */ + if( pBt->autoVacuum ){ + Pgno pgno; + Pgno iGuess = ovfl+1; + u8 eType; + + while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){ + iGuess++; + } + + if( iGuess<=sqlite3PagerPagecount(pBt->pPager) ){ + rc = ptrmapGet(pBt, iGuess, &eType, &pgno); + if( rc!=SQLITE_OK ){ + return rc; + } + if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ + next = iGuess; + } + } + } +#endif + + if( next==0 || ppPage ){ + MemPage *pPage = 0; + + rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0); + assert(rc==SQLITE_OK || pPage==0); + if( next==0 && rc==SQLITE_OK ){ + next = get4byte(pPage->aData); + } + + if( ppPage ){ + *ppPage = pPage; + }else{ + releasePage(pPage); + } + } + *pPgnoNext = next; + + return rc; +} + +/* +** Copy data from a buffer to a page, or from a page to a buffer. +** +** pPayload is a pointer to data stored on database page pDbPage. +** If argument eOp is false, then nByte bytes of data are copied +** from pPayload to the buffer pointed at by pBuf. If eOp is true, +** then sqlite3PagerWrite() is called on pDbPage and nByte bytes +** of data are copied from the buffer pBuf to pPayload. +** +** SQLITE_OK is returned on success, otherwise an error code. +*/ +static int copyPayload( + void *pPayload, /* Pointer to page data */ + void *pBuf, /* Pointer to buffer */ + int nByte, /* Number of bytes to copy */ + int eOp, /* 0 -> copy from page, 1 -> copy to page */ + DbPage *pDbPage /* Page containing pPayload */ +){ + if( eOp ){ + /* Copy data from buffer to page (a write operation) */ + int rc = sqlite3PagerWrite(pDbPage); + if( rc!=SQLITE_OK ){ + return rc; + } + memcpy(pPayload, pBuf, nByte); + }else{ + /* Copy data from page to buffer (a read operation) */ + memcpy(pBuf, pPayload, nByte); + } + return SQLITE_OK; +} + +/* +** This function is used to read or overwrite payload information +** for the entry that the pCur cursor is pointing to. If the eOp +** parameter is 0, this is a read operation (data copied into +** buffer pBuf). If it is non-zero, a write (data copied from +** buffer pBuf). +** +** A total of "amt" bytes are read or written beginning at "offset". +** Data is read to or from the buffer pBuf. +** +** This routine does not make a distinction between key and data. +** It just reads or writes bytes from the payload area. Data might +** appear on the main page or be scattered out on multiple overflow +** pages. +** +** If the BtCursor.isIncrblobHandle flag is set, and the current +** cursor entry uses one or more overflow pages, this function +** allocates space for and lazily popluates the overflow page-list +** cache array (BtCursor.aOverflow). Subsequent calls use this +** cache to make seeking to the supplied offset more efficient. +** +** Once an overflow page-list cache has been allocated, it may be +** invalidated if some other cursor writes to the same table, or if +** the cursor is moved to a different row. Additionally, in auto-vacuum +** mode, the following events may invalidate an overflow page-list cache. +** +** * An incremental vacuum, +** * A commit in auto_vacuum="full" mode, +** * Creating a table (may require moving an overflow page). +*/ +static int accessPayload( BtCursor *pCur, /* Cursor pointing to entry to read from */ int offset, /* Begin reading this far into payload */ int amt, /* Read this many bytes */ unsigned char *pBuf, /* Write the bytes into this buffer */ - int skipKey /* offset begins at data if this is true */ + int skipKey, /* offset begins at data if this is true */ + int eOp /* zero to read. non-zero to write. */ ){ unsigned char *aPayload; - Pgno nextPage; - int rc; - MemPage *pPage; - BtShared *pBt; - int ovflSize; + int rc = SQLITE_OK; u32 nKey; + int iIdx = 0; + MemPage *pPage = pCur->pPage; /* Btree page of current cursor entry */ + BtShared *pBt; /* Btree this cursor belongs to */ - assert( pCur!=0 && pCur->pPage!=0 ); + assert( pPage ); assert( pCur->eState==CURSOR_VALID ); - pBt = pCur->pBtree->pBt; - pPage = pCur->pPage; - pageIntegrity(pPage); assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); + assert( offset>=0 ); + assert( cursorHoldsMutex(pCur) ); + getCellInfo(pCur); aPayload = pCur->info.pCell + pCur->info.nHeader; - if( pPage->intKey ){ - nKey = 0; - }else{ - nKey = pCur->info.nKey; - } - assert( offset>=0 ); + nKey = (pPage->intKey ? 0 : pCur->info.nKey); + if( skipKey ){ offset += nKey; } if( offset+amt > nKey+pCur->info.nData ){ + /* Trying to read or write past the end of the data is an error */ return SQLITE_ERROR; } + + /* Check if data must be read/written to/from the btree page itself. */ if( offset<pCur->info.nLocal ){ int a = amt; if( a+offset>pCur->info.nLocal ){ a = pCur->info.nLocal - offset; } - memcpy(pBuf, &aPayload[offset], a); - if( a==amt ){ - return SQLITE_OK; - } + rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage); offset = 0; pBuf += a; amt -= a; }else{ offset -= pCur->info.nLocal; } - ovflSize = pBt->usableSize - 4; - if( amt>0 ){ + + pBt = pCur->pBt; + if( rc==SQLITE_OK && amt>0 ){ + const int ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */ + Pgno nextPage; + nextPage = get4byte(&aPayload[pCur->info.nLocal]); - while( amt>0 && nextPage ){ - rc = sqlite3pager_get(pBt->pPager, nextPage, (void**)&aPayload); - if( rc!=0 ){ - return rc; + +#ifndef SQLITE_OMIT_INCRBLOB + /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[] + ** has not been allocated, allocate it now. The array is sized at + ** one entry for each overflow page in the overflow chain. The + ** page number of the first overflow page is stored in aOverflow[0], + ** etc. A value of 0 in the aOverflow[] array means "not yet known" + ** (the cache is lazily populated). + */ + if( pCur->isIncrblobHandle && !pCur->aOverflow ){ + int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; + pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl); + if( nOvfl && !pCur->aOverflow ){ + rc = SQLITE_NOMEM; + } + } + + /* If the overflow page-list cache has been allocated and the + ** entry for the first required overflow page is valid, skip + ** directly to it. + */ + if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){ + iIdx = (offset/ovflSize); + nextPage = pCur->aOverflow[iIdx]; + offset = (offset%ovflSize); + } +#endif + + for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){ + +#ifndef SQLITE_OMIT_INCRBLOB + /* If required, populate the overflow page-list cache. */ + if( pCur->aOverflow ){ + assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage); + pCur->aOverflow[iIdx] = nextPage; } - nextPage = get4byte(aPayload); - if( offset<ovflSize ){ +#endif + + if( offset>=ovflSize ){ + /* The only reason to read this page is to obtain the page + ** number for the next page in the overflow chain. The page + ** data is not required. So first try to lookup the overflow + ** page-list cache, if any, then fall back to the getOverflowPage() + ** function. + */ +#ifndef SQLITE_OMIT_INCRBLOB + if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){ + nextPage = pCur->aOverflow[iIdx+1]; + } else +#endif + rc = getOverflowPage(pBt, nextPage, 0, &nextPage); + offset -= ovflSize; + }else{ + /* Need to read this page properly. It contains some of the + ** range of data that is being read (eOp==0) or written (eOp!=0). + */ + DbPage *pDbPage; int a = amt; - if( a + offset > ovflSize ){ - a = ovflSize - offset; + rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage); + if( rc==SQLITE_OK ){ + aPayload = sqlite3PagerGetData(pDbPage); + nextPage = get4byte(aPayload); + if( a + offset > ovflSize ){ + a = ovflSize - offset; + } + rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage); + sqlite3PagerUnref(pDbPage); + offset = 0; + amt -= a; + pBuf += a; } - memcpy(pBuf, &aPayload[offset+4], a); - offset = 0; - amt -= a; - pBuf += a; - }else{ - offset -= ovflSize; } - sqlite3pager_unref(aPayload); } } - if( amt>0 ){ + if( rc==SQLITE_OK && amt>0 ){ return SQLITE_CORRUPT_BKPT; } - return SQLITE_OK; + return rc; } /* @@ -3052,7 +3212,10 @@ static int getPayload( ** the available payload. */ int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ - int rc = restoreOrClearCursorPosition(pCur, 1); + int rc; + + assert( cursorHoldsMutex(pCur) ); + rc = restoreOrClearCursorPosition(pCur); if( rc==SQLITE_OK ){ assert( pCur->eState==CURSOR_VALID ); assert( pCur->pPage!=0 ); @@ -3061,7 +3224,7 @@ int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ } assert( pCur->pPage->intKey==0 ); assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); - rc = getPayload(pCur, offset, amt, (unsigned char*)pBuf, 0); + rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0); } return rc; } @@ -3076,12 +3239,15 @@ int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ ** the available payload. */ int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ - int rc = restoreOrClearCursorPosition(pCur, 1); + int rc; + + assert( cursorHoldsMutex(pCur) ); + rc = restoreOrClearCursorPosition(pCur); if( rc==SQLITE_OK ){ assert( pCur->eState==CURSOR_VALID ); assert( pCur->pPage!=0 ); assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); - rc = getPayload(pCur, offset, amt, pBuf, 1); + rc = accessPayload(pCur, offset, amt, pBuf, 1, 0); } return rc; } @@ -3098,7 +3264,7 @@ int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ ** and data to fit on the local page and for there to be no overflow ** pages. When that is so, this routine can be used to access the ** key and data without making a copy. If the key and/or data spills -** onto overflow pages, then getPayload() must be used to reassembly +** onto overflow pages, then accessPayload() must be used to reassembly ** the key/data and copy it into a preallocated buffer. ** ** The pointer returned by this routine looks directly into the cached @@ -3117,8 +3283,8 @@ static const unsigned char *fetchPayload( assert( pCur!=0 && pCur->pPage!=0 ); assert( pCur->eState==CURSOR_VALID ); + assert( cursorHoldsMutex(pCur) ); pPage = pCur->pPage; - pageIntegrity(pPage); assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); getCellInfo(pCur); aPayload = pCur->info.pCell; @@ -3148,18 +3314,23 @@ static const unsigned char *fetchPayload( ** b-tree page. Write the number of available bytes into *pAmt. ** ** The pointer returned is ephemeral. The key/data may move -** or be destroyed on the next call to any Btree routine. +** or be destroyed on the next call to any Btree routine, +** including calls from other threads against the same cache. +** Hence, a mutex on the BtShared should be held prior to calling +** this routine. ** ** These routines is used to get quick access to key and data ** in the common case where no overflow pages are used. */ const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){ + assert( cursorHoldsMutex(pCur) ); if( pCur->eState==CURSOR_VALID ){ return (const void*)fetchPayload(pCur, pAmt, 0); } return 0; } const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){ + assert( cursorHoldsMutex(pCur) ); if( pCur->eState==CURSOR_VALID ){ return (const void*)fetchPayload(pCur, pAmt, 1); } @@ -3175,12 +3346,12 @@ static int moveToChild(BtCursor *pCur, u32 newPgno){ int rc; MemPage *pNewPage; MemPage *pOldPage; - BtShared *pBt = pCur->pBtree->pBt; + BtShared *pBt = pCur->pBt; + assert( cursorHoldsMutex(pCur) ); assert( pCur->eState==CURSOR_VALID ); rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage); if( rc ) return rc; - pageIntegrity(pNewPage); pNewPage->idxParent = pCur->idx; pOldPage = pCur->pPage; pOldPage->idxShift = 0; @@ -3203,8 +3374,11 @@ static int moveToChild(BtCursor *pCur, u32 newPgno){ ** virtual root page is the page that the right-pointer of page ** 1 is pointing to. */ -static int isRootPage(MemPage *pPage){ - MemPage *pParent = pPage->pParent; +int sqlite3BtreeIsRootPage(MemPage *pPage){ + MemPage *pParent; + + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); + pParent = pPage->pParent; if( pParent==0 ) return 1; if( pParent->pgno>1 ) return 0; if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1; @@ -3219,21 +3393,20 @@ static int isRootPage(MemPage *pPage){ ** right-most child page then pCur->idx is set to one more than ** the largest cell index. */ -static void moveToParent(BtCursor *pCur){ +void sqlite3BtreeMoveToParent(BtCursor *pCur){ MemPage *pParent; MemPage *pPage; int idxParent; + assert( cursorHoldsMutex(pCur) ); assert( pCur->eState==CURSOR_VALID ); pPage = pCur->pPage; assert( pPage!=0 ); - assert( !isRootPage(pPage) ); - pageIntegrity(pPage); + assert( !sqlite3BtreeIsRootPage(pPage) ); pParent = pPage->pParent; assert( pParent!=0 ); - pageIntegrity(pParent); idxParent = pPage->idxParent; - sqlite3pager_ref(pParent->aData); + sqlite3PagerRef(pParent->pDbPage); releasePage(pPage); pCur->pPage = pParent; pCur->info.nSize = 0; @@ -3247,9 +3420,19 @@ static void moveToParent(BtCursor *pCur){ static int moveToRoot(BtCursor *pCur){ MemPage *pRoot; int rc = SQLITE_OK; - BtShared *pBt = pCur->pBtree->pBt; + Btree *p = pCur->pBtree; + BtShared *pBt = p->pBt; - restoreOrClearCursorPosition(pCur, 0); + assert( cursorHoldsMutex(pCur) ); + assert( CURSOR_INVALID < CURSOR_REQUIRESEEK ); + assert( CURSOR_VALID < CURSOR_REQUIRESEEK ); + assert( CURSOR_FAULT > CURSOR_REQUIRESEEK ); + if( pCur->eState>=CURSOR_REQUIRESEEK ){ + if( pCur->eState==CURSOR_FAULT ){ + return pCur->skip; + } + clearCursorPosition(pCur); + } pRoot = pCur->pPage; if( pRoot && pRoot->pgno==pCur->pgnoRoot ){ assert( pRoot->isInit ); @@ -3261,7 +3444,6 @@ static int moveToRoot(BtCursor *pCur){ return rc; } releasePage(pCur->pPage); - pageIntegrity(pRoot); pCur->pPage = pRoot; } pCur->idx = 0; @@ -3287,17 +3469,17 @@ static int moveToRoot(BtCursor *pCur){ */ static int moveToLeftmost(BtCursor *pCur){ Pgno pgno; - int rc; + int rc = SQLITE_OK; MemPage *pPage; + assert( cursorHoldsMutex(pCur) ); assert( pCur->eState==CURSOR_VALID ); - while( !(pPage = pCur->pPage)->leaf ){ + while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){ assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); pgno = get4byte(findCell(pPage, pCur->idx)); rc = moveToChild(pCur, pgno); - if( rc ) return rc; } - return SQLITE_OK; + return rc; } /* @@ -3312,18 +3494,20 @@ static int moveToLeftmost(BtCursor *pCur){ */ static int moveToRightmost(BtCursor *pCur){ Pgno pgno; - int rc; + int rc = SQLITE_OK; MemPage *pPage; + assert( cursorHoldsMutex(pCur) ); assert( pCur->eState==CURSOR_VALID ); - while( !(pPage = pCur->pPage)->leaf ){ + while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){ pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); pCur->idx = pPage->nCell; rc = moveToChild(pCur, pgno); - if( rc ) return rc; } - pCur->idx = pPage->nCell - 1; - pCur->info.nSize = 0; + if( rc==SQLITE_OK ){ + pCur->idx = pPage->nCell - 1; + pCur->info.nSize = 0; + } return SQLITE_OK; } @@ -3333,16 +3517,21 @@ static int moveToRightmost(BtCursor *pCur){ */ int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ int rc; + + assert( cursorHoldsMutex(pCur) ); + assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); rc = moveToRoot(pCur); - if( rc ) return rc; - if( pCur->eState==CURSOR_INVALID ){ - assert( pCur->pPage->nCell==0 ); - *pRes = 1; - return SQLITE_OK; + if( rc==SQLITE_OK ){ + if( pCur->eState==CURSOR_INVALID ){ + assert( pCur->pPage->nCell==0 ); + *pRes = 1; + rc = SQLITE_OK; + }else{ + assert( pCur->pPage->nCell>0 ); + *pRes = 0; + rc = moveToLeftmost(pCur); + } } - assert( pCur->pPage->nCell>0 ); - *pRes = 0; - rc = moveToLeftmost(pCur); return rc; } @@ -3352,16 +3541,20 @@ int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ */ int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ int rc; + + assert( cursorHoldsMutex(pCur) ); + assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); rc = moveToRoot(pCur); - if( rc ) return rc; - if( CURSOR_INVALID==pCur->eState ){ - assert( pCur->pPage->nCell==0 ); - *pRes = 1; - return SQLITE_OK; + if( rc==SQLITE_OK ){ + if( CURSOR_INVALID==pCur->eState ){ + assert( pCur->pPage->nCell==0 ); + *pRes = 1; + }else{ + assert( pCur->eState==CURSOR_VALID ); + *pRes = 0; + rc = moveToRightmost(pCur); + } } - assert( pCur->eState==CURSOR_VALID ); - *pRes = 0; - rc = moveToRightmost(pCur); return rc; } @@ -3391,21 +3584,31 @@ int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ ** ** *pRes>0 The cursor is left pointing at an entry that ** is larger than pKey. +** */ -int sqlite3BtreeMoveto(BtCursor *pCur, const void *pKey, i64 nKey, int *pRes){ +int sqlite3BtreeMoveto( + BtCursor *pCur, /* The cursor to be moved */ + const void *pKey, /* The key content for indices. Not used by tables */ + i64 nKey, /* Size of pKey. Or the key for tables */ + int biasRight, /* If true, bias the search to the high end */ + int *pRes /* Search result flag */ +){ int rc; - int tryRightmost; + + assert( cursorHoldsMutex(pCur) ); + assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); rc = moveToRoot(pCur); - if( rc ) return rc; + if( rc ){ + return rc; + } assert( pCur->pPage ); assert( pCur->pPage->isInit ); - tryRightmost = pCur->pPage->intKey; if( pCur->eState==CURSOR_INVALID ){ *pRes = -1; assert( pCur->pPage->nCell==0 ); return SQLITE_OK; } - for(;;){ + for(;;){ int lwr, upr; Pgno chldPg; MemPage *pPage = pCur->pPage; @@ -3415,17 +3618,17 @@ int sqlite3BtreeMoveto(BtCursor *pCur, const void *pKey, i64 nKey, int *pRes){ if( !pPage->intKey && pKey==0 ){ return SQLITE_CORRUPT_BKPT; } - pageIntegrity(pPage); - while( lwr<=upr ){ + if( biasRight ){ + pCur->idx = upr; + }else{ + pCur->idx = (upr+lwr)/2; + } + if( lwr<=upr ) for(;;){ void *pCellKey; i64 nCellKey; - pCur->idx = (lwr+upr)/2; pCur->info.nSize = 0; if( pPage->intKey ){ u8 *pCell; - if( tryRightmost ){ - pCur->idx = upr; - } pCell = findCell(pPage, pCur->idx) + pPage->childPtrSize; if( pPage->hasData ){ u32 dummy; @@ -3436,7 +3639,6 @@ int sqlite3BtreeMoveto(BtCursor *pCur, const void *pKey, i64 nKey, int *pRes){ c = -1; }else if( nCellKey>nKey ){ c = +1; - tryRightmost = 0; }else{ c = 0; } @@ -3447,12 +3649,14 @@ int sqlite3BtreeMoveto(BtCursor *pCur, const void *pKey, i64 nKey, int *pRes){ if( available>=nCellKey ){ c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey); }else{ - pCellKey = sqliteMallocRaw( nCellKey ); + pCellKey = sqlite3_malloc( nCellKey ); if( pCellKey==0 ) return SQLITE_NOMEM; rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey); c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey); - sqliteFree(pCellKey); - if( rc ) return rc; + sqlite3_free(pCellKey); + if( rc ){ + return rc; + } } } if( c==0 ){ @@ -3470,6 +3674,10 @@ int sqlite3BtreeMoveto(BtCursor *pCur, const void *pKey, i64 nKey, int *pRes){ }else{ upr = pCur->idx-1; } + if( lwr>upr ){ + break; + } + pCur->idx = (lwr+upr)/2; } assert( lwr==upr+1 ); assert( pPage->isInit ); @@ -3495,6 +3703,7 @@ int sqlite3BtreeMoveto(BtCursor *pCur, const void *pKey, i64 nKey, int *pRes){ /* NOT REACHED */ } + /* ** Return TRUE if the cursor is not pointing at an entry of the table. ** @@ -3511,34 +3720,41 @@ int sqlite3BtreeEof(BtCursor *pCur){ } /* +** Return the database connection handle for a cursor. +*/ +sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){ + assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); + return pCur->pBtree->db; +} + +/* ** Advance the cursor to the next entry in the database. If ** successful then set *pRes=0. If the cursor ** was already pointing to the last entry in the database before ** this routine was called, then set *pRes=1. */ -int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ +static int btreeNext(BtCursor *pCur, int *pRes){ int rc; MemPage *pPage; -#ifndef SQLITE_OMIT_SHARED_CACHE - rc = restoreOrClearCursorPosition(pCur, 1); + assert( cursorHoldsMutex(pCur) ); + rc = restoreOrClearCursorPosition(pCur); if( rc!=SQLITE_OK ){ return rc; } + assert( pRes!=0 ); + pPage = pCur->pPage; + if( CURSOR_INVALID==pCur->eState ){ + *pRes = 1; + return SQLITE_OK; + } if( pCur->skip>0 ){ pCur->skip = 0; *pRes = 0; return SQLITE_OK; } pCur->skip = 0; -#endif - assert( pRes!=0 ); - pPage = pCur->pPage; - if( CURSOR_INVALID==pCur->eState ){ - *pRes = 1; - return SQLITE_OK; - } assert( pPage->isInit ); assert( pCur->idx<pPage->nCell ); @@ -3553,12 +3769,12 @@ int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ return rc; } do{ - if( isRootPage(pPage) ){ + if( sqlite3BtreeIsRootPage(pPage) ){ *pRes = 1; pCur->eState = CURSOR_INVALID; return SQLITE_OK; } - moveToParent(pCur); + sqlite3BtreeMoveToParent(pCur); pPage = pCur->pPage; }while( pCur->idx>=pPage->nCell ); *pRes = 0; @@ -3576,6 +3792,13 @@ int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ rc = moveToLeftmost(pCur); return rc; } +int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ + int rc; + assert( cursorHoldsMutex(pCur) ); + rc = btreeNext(pCur, pRes); + return rc; +} + /* ** Step the cursor to the back to the previous entry in the database. If @@ -3583,28 +3806,26 @@ int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ ** was already pointing to the first entry in the database before ** this routine was called, then set *pRes=1. */ -int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ +static int btreePrevious(BtCursor *pCur, int *pRes){ int rc; Pgno pgno; MemPage *pPage; -#ifndef SQLITE_OMIT_SHARED_CACHE - rc = restoreOrClearCursorPosition(pCur, 1); + assert( cursorHoldsMutex(pCur) ); + rc = restoreOrClearCursorPosition(pCur); if( rc!=SQLITE_OK ){ return rc; } + if( CURSOR_INVALID==pCur->eState ){ + *pRes = 1; + return SQLITE_OK; + } if( pCur->skip<0 ){ pCur->skip = 0; *pRes = 0; return SQLITE_OK; } pCur->skip = 0; -#endif - - if( CURSOR_INVALID==pCur->eState ){ - *pRes = 1; - return SQLITE_OK; - } pPage = pCur->pPage; assert( pPage->isInit ); @@ -3612,16 +3833,18 @@ int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ if( !pPage->leaf ){ pgno = get4byte( findCell(pPage, pCur->idx) ); rc = moveToChild(pCur, pgno); - if( rc ) return rc; + if( rc ){ + return rc; + } rc = moveToRightmost(pCur); }else{ while( pCur->idx==0 ){ - if( isRootPage(pPage) ){ + if( sqlite3BtreeIsRootPage(pPage) ){ pCur->eState = CURSOR_INVALID; *pRes = 1; return SQLITE_OK; } - moveToParent(pCur); + sqlite3BtreeMoveToParent(pCur); pPage = pCur->pPage; } pCur->idx--; @@ -3635,18 +3858,24 @@ int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ *pRes = 0; return rc; } +int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ + int rc; + assert( cursorHoldsMutex(pCur) ); + rc = btreePrevious(pCur, pRes); + return rc; +} /* ** Allocate a new page from the database file. ** -** The new page is marked as dirty. (In other words, sqlite3pager_write() +** The new page is marked as dirty. (In other words, sqlite3PagerWrite() ** has already been called on the new page.) The new page has also ** been referenced and the calling routine is responsible for calling -** sqlite3pager_unref() on the new page when it is done. +** sqlite3PagerUnref() on the new page when it is done. ** ** SQLITE_OK is returned on success. Any other return value indicates ** an error. *ppPage and *pPgno are undefined in the event of an error. -** Do not invoke sqlite3pager_unref() on *ppPage if an error is returned. +** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned. ** ** If the "nearby" parameter is not 0, then a (feeble) effort is made to ** locate a page close to the page number "nearby". This can be used in an @@ -3657,7 +3886,7 @@ int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ ** anywhere on the free-list, then it is guarenteed to be returned. This ** is only used by auto-vacuum databases when allocating a new table. */ -static int allocatePage( +static int allocateBtreePage( BtShared *pBt, MemPage **ppPage, Pgno *pPgno, @@ -3668,14 +3897,15 @@ static int allocatePage( int rc; int n; /* Number of pages on the freelist */ int k; /* Number of leaves on the trunk of the freelist */ + MemPage *pTrunk = 0; + MemPage *pPrevTrunk = 0; + assert( sqlite3_mutex_held(pBt->mutex) ); pPage1 = pBt->pPage1; n = get4byte(&pPage1->aData[36]); if( n>0 ){ /* There are pages on the freelist. Reuse one of those pages. */ - MemPage *pTrunk = 0; Pgno iTrunk; - MemPage *pPrevTrunk = 0; u8 searchList = 0; /* If the free-list must be searched for 'nearby' */ /* If the 'exact' parameter was true and a query of the pointer-map @@ -3683,7 +3913,7 @@ static int allocatePage( ** the entire-list will be searched for that page. */ #ifndef SQLITE_OMIT_AUTOVACUUM - if( exact ){ + if( exact && nearby<=sqlite3PagerPagecount(pBt->pPager) ){ u8 eType; assert( nearby>0 ); assert( pBt->autoVacuum ); @@ -3699,7 +3929,7 @@ static int allocatePage( /* Decrement the free-list count by 1. Set iTrunk to the index of the ** first free-list trunk page. iPrevTrunk is initially 1. */ - rc = sqlite3pager_write(pPage1->aData); + rc = sqlite3PagerWrite(pPage1->pDbPage); if( rc ) return rc; put4byte(&pPage1->aData[36], n-1); @@ -3714,18 +3944,10 @@ static int allocatePage( }else{ iTrunk = get4byte(&pPage1->aData[32]); } - rc = getPage(pBt, iTrunk, &pTrunk); + rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0); if( rc ){ - releasePage(pPrevTrunk); - return rc; - } - - /* TODO: This should move to after the loop? */ - rc = sqlite3pager_write(pTrunk->aData); - if( rc ){ - releasePage(pTrunk); - releasePage(pPrevTrunk); - return rc; + pTrunk = 0; + goto end_allocate_page; } k = get4byte(&pTrunk->aData[4]); @@ -3734,6 +3956,10 @@ static int allocatePage( ** So extract the trunk page itself and use it as the newly ** allocated page */ assert( pPrevTrunk==0 ); + rc = sqlite3PagerWrite(pTrunk->pDbPage); + if( rc ){ + goto end_allocate_page; + } *pPgno = iTrunk; memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); *ppPage = pTrunk; @@ -3741,7 +3967,8 @@ static int allocatePage( TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); }else if( k>pBt->usableSize/4 - 8 ){ /* Value of k is out of range. Database corruption */ - return SQLITE_CORRUPT_BKPT; + rc = SQLITE_CORRUPT_BKPT; + goto end_allocate_page; #ifndef SQLITE_OMIT_AUTOVACUUM }else if( searchList && nearby==iTrunk ){ /* The list is being searched and this trunk page is the page @@ -3750,6 +3977,10 @@ static int allocatePage( assert( *pPgno==iTrunk ); *ppPage = pTrunk; searchList = 0; + rc = sqlite3PagerWrite(pTrunk->pDbPage); + if( rc ){ + goto end_allocate_page; + } if( k==0 ){ if( !pPrevTrunk ){ memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); @@ -3763,28 +3994,28 @@ static int allocatePage( */ MemPage *pNewTrunk; Pgno iNewTrunk = get4byte(&pTrunk->aData[8]); - rc = getPage(pBt, iNewTrunk, &pNewTrunk); + rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0); if( rc!=SQLITE_OK ){ - releasePage(pTrunk); - releasePage(pPrevTrunk); - return rc; + goto end_allocate_page; } - rc = sqlite3pager_write(pNewTrunk->aData); + rc = sqlite3PagerWrite(pNewTrunk->pDbPage); if( rc!=SQLITE_OK ){ releasePage(pNewTrunk); - releasePage(pTrunk); - releasePage(pPrevTrunk); - return rc; + goto end_allocate_page; } memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4); put4byte(&pNewTrunk->aData[4], k-1); memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4); + releasePage(pNewTrunk); if( !pPrevTrunk ){ put4byte(&pPage1->aData[32], iNewTrunk); }else{ + rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); + if( rc ){ + goto end_allocate_page; + } put4byte(&pPrevTrunk->aData[0], iNewTrunk); } - releasePage(pNewTrunk); } pTrunk = 0; TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); @@ -3794,6 +4025,10 @@ static int allocatePage( int closest; Pgno iPage; unsigned char *aData = pTrunk->aData; + rc = sqlite3PagerWrite(pTrunk->pDbPage); + if( rc ){ + goto end_allocate_page; + } if( nearby>0 ){ int i, dist; closest = 0; @@ -3814,7 +4049,7 @@ static int allocatePage( iPage = get4byte(&aData[8+closest*4]); if( !searchList || iPage==nearby ){ *pPgno = iPage; - if( *pPgno>sqlite3pager_pagecount(pBt->pPager) ){ + if( *pPgno>sqlite3PagerPagecount(pBt->pPager) ){ /* Free page off the end of the file */ return SQLITE_CORRUPT_BKPT; } @@ -3825,10 +4060,10 @@ static int allocatePage( memcpy(&aData[8+closest*4], &aData[4+k*4], 4); } put4byte(&aData[4], k-1); - rc = getPage(pBt, *pPgno, ppPage); + rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1); if( rc==SQLITE_OK ){ - sqlite3pager_dont_rollback((*ppPage)->aData); - rc = sqlite3pager_write((*ppPage)->aData); + sqlite3PagerDontRollback((*ppPage)->pDbPage); + rc = sqlite3PagerWrite((*ppPage)->pDbPage); if( rc!=SQLITE_OK ){ releasePage(*ppPage); } @@ -3837,14 +4072,24 @@ static int allocatePage( } } releasePage(pPrevTrunk); + pPrevTrunk = 0; }while( searchList ); - releasePage(pTrunk); }else{ /* There are no pages on the freelist, so create a new page at the ** end of the file */ - *pPgno = sqlite3pager_pagecount(pBt->pPager) + 1; + *pPgno = sqlite3PagerPagecount(pBt->pPager) + 1; #ifndef SQLITE_OMIT_AUTOVACUUM + if( pBt->nTrunc ){ + /* An incr-vacuum has already run within this transaction. So the + ** page to allocate is not from the physical end of the file, but + ** at pBt->nTrunc. + */ + *pPgno = pBt->nTrunc+1; + if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ + (*pPgno)++; + } + } if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){ /* If *pPgno refers to a pointer-map page, allocate two new pages ** at the end of the file instead of one. The first allocated page @@ -3853,13 +4098,17 @@ static int allocatePage( TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno)); assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); (*pPgno)++; + if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; } + } + if( pBt->nTrunc ){ + pBt->nTrunc = *pPgno; } #endif assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); - rc = getPage(pBt, *pPgno, ppPage); + rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0); if( rc ) return rc; - rc = sqlite3pager_write((*ppPage)->aData); + rc = sqlite3PagerWrite((*ppPage)->pDbPage); if( rc!=SQLITE_OK ){ releasePage(*ppPage); } @@ -3867,13 +4116,17 @@ static int allocatePage( } assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); + +end_allocate_page: + releasePage(pTrunk); + releasePage(pPrevTrunk); return rc; } /* ** Add a page of the database file to the freelist. ** -** sqlite3pager_unref() is NOT called for pPage. +** sqlite3PagerUnref() is NOT called for pPage. */ static int freePage(MemPage *pPage){ BtShared *pBt = pPage->pBt; @@ -3881,13 +4134,14 @@ static int freePage(MemPage *pPage){ int rc, n, k; /* Prepare the page for freeing */ + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); assert( pPage->pgno>1 ); pPage->isInit = 0; releasePage(pPage->pParent); pPage->pParent = 0; /* Increment the free page count on pPage1 */ - rc = sqlite3pager_write(pPage1->aData); + rc = sqlite3PagerWrite(pPage1->pDbPage); if( rc ) return rc; n = get4byte(&pPage1->aData[36]); put4byte(&pPage1->aData[36], n+1); @@ -3896,7 +4150,7 @@ static int freePage(MemPage *pPage){ /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then ** always fully overwrite deleted information with zeros. */ - rc = sqlite3pager_write(pPage->aData); + rc = sqlite3PagerWrite(pPage->pDbPage); if( rc ) return rc; memset(pPage->aData, 0, pPage->pBt->pageSize); #endif @@ -3913,7 +4167,7 @@ static int freePage(MemPage *pPage){ if( n==0 ){ /* This is the first free page */ - rc = sqlite3pager_write(pPage->aData); + rc = sqlite3PagerWrite(pPage->pDbPage); if( rc ) return rc; memset(pPage->aData, 0, 8); put4byte(&pPage1->aData[32], pPage->pgno); @@ -3922,28 +4176,32 @@ static int freePage(MemPage *pPage){ /* Other free pages already exist. Retrive the first trunk page ** of the freelist and find out how many leaves it has. */ MemPage *pTrunk; - rc = getPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk); + rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0); if( rc ) return rc; k = get4byte(&pTrunk->aData[4]); if( k>=pBt->usableSize/4 - 8 ){ /* The trunk is full. Turn the page being freed into a new ** trunk page with no leaves. */ - rc = sqlite3pager_write(pPage->aData); - if( rc ) return rc; - put4byte(pPage->aData, pTrunk->pgno); - put4byte(&pPage->aData[4], 0); - put4byte(&pPage1->aData[32], pPage->pgno); - TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", - pPage->pgno, pTrunk->pgno)); + rc = sqlite3PagerWrite(pPage->pDbPage); + if( rc==SQLITE_OK ){ + put4byte(pPage->aData, pTrunk->pgno); + put4byte(&pPage->aData[4], 0); + put4byte(&pPage1->aData[32], pPage->pgno); + TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", + pPage->pgno, pTrunk->pgno)); + } + }else if( k<0 ){ + rc = SQLITE_CORRUPT; }else{ /* Add the newly freed page as a leaf on the current trunk */ - rc = sqlite3pager_write(pTrunk->aData); - if( rc ) return rc; - put4byte(&pTrunk->aData[4], k+1); - put4byte(&pTrunk->aData[8+k*4], pPage->pgno); + rc = sqlite3PagerWrite(pTrunk->pDbPage); + if( rc==SQLITE_OK ){ + put4byte(&pTrunk->aData[4], k+1); + put4byte(&pTrunk->aData[8+k*4], pPage->pgno); #ifndef SQLITE_SECURE_DELETE - sqlite3pager_dont_write(pBt->pPager, pPage->pgno); + sqlite3PagerDontWrite(pPage->pDbPage); #endif + } TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno)); } releasePage(pTrunk); @@ -3959,22 +4217,28 @@ static int clearCell(MemPage *pPage, unsigned char *pCell){ CellInfo info; Pgno ovflPgno; int rc; + int nOvfl; + int ovflPageSize; - parseCellPtr(pPage, pCell, &info); + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); + sqlite3BtreeParseCellPtr(pPage, pCell, &info); if( info.iOverflow==0 ){ return SQLITE_OK; /* No overflow pages. Return without doing anything */ } ovflPgno = get4byte(&pCell[info.iOverflow]); - while( ovflPgno!=0 ){ + ovflPageSize = pBt->usableSize - 4; + nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize; + assert( ovflPgno==0 || nOvfl>0 ); + while( nOvfl-- ){ MemPage *pOvfl; - if( ovflPgno>sqlite3pager_pagecount(pBt->pPager) ){ + if( ovflPgno==0 || ovflPgno>sqlite3PagerPagecount(pBt->pPager) ){ return SQLITE_CORRUPT_BKPT; } - rc = getPage(pBt, ovflPgno, &pOvfl); + + rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno); if( rc ) return rc; - ovflPgno = get4byte(pOvfl->aData); rc = freePage(pOvfl); - sqlite3pager_unref(pOvfl->aData); + sqlite3PagerUnref(pOvfl->pDbPage); if( rc ) return rc; } return SQLITE_OK; @@ -3997,6 +4261,7 @@ static int fillInCell( unsigned char *pCell, /* Complete text of the cell */ const void *pKey, i64 nKey, /* The key */ const void *pData,int nData, /* The data */ + int nZero, /* Extra zero bytes to append to pData */ int *pnSize /* Write cell size here */ ){ int nPayload; @@ -4012,24 +4277,26 @@ static int fillInCell( int nHeader; CellInfo info; + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); + /* Fill in the header. */ nHeader = 0; if( !pPage->leaf ){ nHeader += 4; } if( pPage->hasData ){ - nHeader += putVarint(&pCell[nHeader], nData); + nHeader += putVarint(&pCell[nHeader], nData+nZero); }else{ - nData = 0; + nData = nZero = 0; } nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey); - parseCellPtr(pPage, pCell, &info); + sqlite3BtreeParseCellPtr(pPage, pCell, &info); assert( info.nHeader==nHeader ); assert( info.nKey==nKey ); - assert( info.nData==nData ); + assert( info.nData==nData+nZero ); /* Fill in the payload */ - nPayload = nData; + nPayload = nData + nZero; if( pPage->intKey ){ pSrc = pData; nSrc = nData; @@ -4046,23 +4313,42 @@ static int fillInCell( while( nPayload>0 ){ if( spaceLeft==0 ){ + int isExact = 0; #ifndef SQLITE_OMIT_AUTOVACUUM Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */ + if( pBt->autoVacuum ){ + do{ + pgnoOvfl++; + } while( + PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) + ); + if( pgnoOvfl>1 ){ + /* isExact = 1; */ + } + } #endif - rc = allocatePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0); + rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact); #ifndef SQLITE_OMIT_AUTOVACUUM /* If the database supports auto-vacuum, and the second or subsequent ** overflow page is being allocated, add an entry to the pointer-map - ** for that page now. The entry for the first overflow page will be - ** added later, by the insertCell() routine. + ** for that page now. + ** + ** If this is the first overflow page, then write a partial entry + ** to the pointer-map. If we write nothing to this pointer-map slot, + ** then the optimistic overflow chain processing in clearCell() + ** may misinterpret the uninitialised values and delete the + ** wrong pages from the database. */ - if( pBt->autoVacuum && pgnoPtrmap!=0 && rc==SQLITE_OK ){ - rc = ptrmapPut(pBt, pgnoOvfl, PTRMAP_OVERFLOW2, pgnoPtrmap); + if( pBt->autoVacuum && rc==SQLITE_OK ){ + u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1); + rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap); + if( rc ){ + releasePage(pOvfl); + } } #endif if( rc ){ releasePage(pToRelease); - /* clearCell(pPage, pCell); */ return rc; } put4byte(pPrior, pgnoOvfl); @@ -4075,9 +4361,13 @@ static int fillInCell( } n = nPayload; if( n>spaceLeft ) n = spaceLeft; - if( n>nSrc ) n = nSrc; - assert( pSrc ); - memcpy(pPayload, pSrc, n); + if( nSrc>0 ){ + if( n>nSrc ) n = nSrc; + assert( pSrc ); + memcpy(pPayload, pSrc, n); + }else{ + memset(pPayload, 0, n); + } nPayload -= n; pPayload += n; pSrc += n; @@ -4099,24 +4389,25 @@ static int fillInCell( */ static int reparentPage(BtShared *pBt, Pgno pgno, MemPage *pNewParent, int idx){ MemPage *pThis; - unsigned char *aData; + DbPage *pDbPage; + assert( sqlite3_mutex_held(pBt->mutex) ); assert( pNewParent!=0 ); if( pgno==0 ) return SQLITE_OK; assert( pBt->pPager!=0 ); - aData = sqlite3pager_lookup(pBt->pPager, pgno); - if( aData ){ - pThis = (MemPage*)&aData[pBt->pageSize]; - assert( pThis->aData==aData ); + pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); + if( pDbPage ){ + pThis = (MemPage *)sqlite3PagerGetExtra(pDbPage); if( pThis->isInit ){ + assert( pThis->aData==sqlite3PagerGetData(pDbPage) ); if( pThis->pParent!=pNewParent ){ - if( pThis->pParent ) sqlite3pager_unref(pThis->pParent->aData); + if( pThis->pParent ) sqlite3PagerUnref(pThis->pParent->pDbPage); pThis->pParent = pNewParent; - sqlite3pager_ref(pNewParent->aData); + sqlite3PagerRef(pNewParent->pDbPage); } pThis->idxParent = idx; } - sqlite3pager_unref(aData); + sqlite3PagerUnref(pDbPage); } #ifndef SQLITE_OMIT_AUTOVACUUM @@ -4144,6 +4435,7 @@ static int reparentChildPages(MemPage *pPage){ BtShared *pBt = pPage->pBt; int rc = SQLITE_OK; + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); if( pPage->leaf ) return SQLITE_OK; for(i=0; i<pPage->nCell; i++){ @@ -4177,7 +4469,8 @@ static void dropCell(MemPage *pPage, int idx, int sz){ assert( idx>=0 && idx<pPage->nCell ); assert( sz==cellSize(pPage, idx) ); - assert( sqlite3pager_iswriteable(pPage->aData) ); + assert( sqlite3PagerIswriteable(pPage->pDbPage) ); + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); data = pPage->aData; ptr = &data[pPage->cellOffset + 2*idx]; pc = get2byte(ptr); @@ -4230,7 +4523,7 @@ static int insertCell( assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); assert( sz==cellSizePtr(pPage, pCell) ); - assert( sqlite3pager_iswriteable(pPage->aData) ); + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); if( pPage->nOverflow || sz+2>pPage->nFree ){ if( pTemp ){ memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip); @@ -4242,6 +4535,11 @@ static int insertCell( pPage->aOvfl[j].idx = i; pPage->nFree = 0; }else{ + int rc = sqlite3PagerWrite(pPage->pDbPage); + if( rc!=SQLITE_OK ){ + return rc; + } + assert( sqlite3PagerIswriteable(pPage->pDbPage) ); data = pPage->aData; hdr = pPage->hdrOffset; top = get2byte(&data[hdr+5]); @@ -4249,7 +4547,7 @@ static int insertCell( end = cellOffset + 2*pPage->nCell + 2; ins = cellOffset + 2*i; if( end > top - sz ){ - int rc = defragmentPage(pPage); + rc = defragmentPage(pPage); if( rc!=SQLITE_OK ) return rc; top = get2byte(&data[hdr+5]); assert( end + sz <= top ); @@ -4267,17 +4565,17 @@ static int insertCell( put2byte(&data[ins], idx); put2byte(&data[hdr+3], pPage->nCell); pPage->idxShift = 1; - pageIntegrity(pPage); #ifndef SQLITE_OMIT_AUTOVACUUM if( pPage->pBt->autoVacuum ){ /* The cell may contain a pointer to an overflow page. If so, write ** the entry for the overflow page into the pointer map. */ CellInfo info; - parseCellPtr(pPage, pCell, &info); + sqlite3BtreeParseCellPtr(pPage, pCell, &info); + assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){ Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); - int rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno); + rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno); if( rc!=SQLITE_OK ) return rc; } } @@ -4305,6 +4603,7 @@ static void assemblePage( u8 *data; /* Data for the page */ assert( pPage->nOverflow==0 ); + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); totalSize = 0; for(i=0; i<nCell; i++){ totalSize += aSize[i]; @@ -4379,10 +4678,12 @@ static int balance_quick(MemPage *pPage, MemPage *pParent){ int parentSize; /* Size of new divider cell */ u8 parentCell[64]; /* Space for the new divider cell */ + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); + /* Allocate a new page. Insert the overflow cell from pPage ** into it. Then remove the overflow cell from pPage. */ - rc = allocatePage(pBt, &pNew, &pgnoNew, 0, 0); + rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); if( rc!=SQLITE_OK ){ return rc; } @@ -4394,15 +4695,16 @@ static int balance_quick(MemPage *pPage, MemPage *pParent){ /* Set the parent of the newly allocated page to pParent. */ pNew->pParent = pParent; - sqlite3pager_ref(pParent->aData); + sqlite3PagerRef(pParent->pDbPage); /* pPage is currently the right-child of pParent. Change this ** so that the right-child is the new page allocated above and ** pPage is the next-to-right child. */ assert( pPage->nCell>0 ); - parseCellPtr(pPage, findCell(pPage, pPage->nCell-1), &info); - rc = fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, &parentSize); + pCell = findCell(pPage, pPage->nCell-1); + sqlite3BtreeParseCellPtr(pPage, pCell, &info); + rc = fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize); if( rc!=SQLITE_OK ){ return rc; } @@ -4421,11 +4723,11 @@ static int balance_quick(MemPage *pPage, MemPage *pParent){ */ if( pBt->autoVacuum ){ rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno); - if( rc!=SQLITE_OK ){ - return rc; + if( rc==SQLITE_OK ){ + rc = ptrmapPutOvfl(pNew, 0); } - rc = ptrmapPutOvfl(pNew, 0); if( rc!=SQLITE_OK ){ + releasePage(pNew); return rc; } } @@ -4440,19 +4742,6 @@ static int balance_quick(MemPage *pPage, MemPage *pParent){ #endif /* SQLITE_OMIT_QUICKBALANCE */ /* -** The ISAUTOVACUUM macro is used within balance_nonroot() to determine -** if the database supports auto-vacuum or not. Because it is used -** within an expression that is an argument to another macro -** (sqliteMallocRaw), it is not possible to use conditional compilation. -** So, this macro is defined instead. -*/ -#ifndef SQLITE_OMIT_AUTOVACUUM -#define ISAUTOVACUUM (pBt->autoVacuum) -#else -#define ISAUTOVACUUM 0 -#endif - -/* ** This routine redistributes Cells on pPage and up to NN*2 siblings ** of pPage so that all pages have about the same amount of free space. ** Usually NN siblings on either side of pPage is used in the balancing, @@ -4483,7 +4772,7 @@ static int balance_quick(MemPage *pPage, MemPage *pParent){ */ static int balance_nonroot(MemPage *pPage){ MemPage *pParent; /* The parent of pPage */ - BtShared *pBt; /* The whole database */ + BtShared *pBt; /* The whole database */ int nCell = 0; /* Number of cells in apCell[] */ int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ int nOld; /* Number of pages in apOld[] */ @@ -4515,15 +4804,17 @@ static int balance_nonroot(MemPage *pPage){ u8 *aFrom = 0; #endif + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); + /* ** Find the parent page. */ assert( pPage->isInit ); - assert( sqlite3pager_iswriteable(pPage->aData) ); + assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 ); pBt = pPage->pBt; pParent = pPage->pParent; assert( pParent ); - if( SQLITE_OK!=(rc = sqlite3pager_write(pParent->aData)) ){ + if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){ return rc; } TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno)); @@ -4553,6 +4844,10 @@ static int balance_nonroot(MemPage *pPage){ } #endif + if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){ + return rc; + } + /* ** Find the cell in the parent page whose left child points back ** to pPage. The "idx" variable is the index of that cell. If pPage @@ -4561,7 +4856,7 @@ static int balance_nonroot(MemPage *pPage){ if( pParent->idxShift ){ Pgno pgno; pgno = pPage->pgno; - assert( pgno==sqlite3pager_pagenumber(pPage->aData) ); + assert( pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); for(idx=0; idx<pParent->nCell; idx++){ if( get4byte(findCell(pParent, idx))==pgno ){ break; @@ -4578,7 +4873,7 @@ static int balance_nonroot(MemPage *pPage){ ** directly to balance_cleanup at any moment. */ nOld = nNew = 0; - sqlite3pager_ref(pParent->aData); + sqlite3PagerRef(pParent->pDbPage); /* ** Find sibling pages to pPage and the cells in pParent that divide @@ -4622,7 +4917,7 @@ static int balance_nonroot(MemPage *pPage){ /* ** Allocate space for memory structures */ - apCell = sqliteMallocRaw( + apCell = sqlite3_malloc( nMaxCells*sizeof(u8*) /* apCell */ + nMaxCells*sizeof(int) /* szCell */ + ROUND8(sizeof(MemPage))*NB /* aCopy */ @@ -4655,12 +4950,10 @@ static int balance_nonroot(MemPage *pPage){ ** process of being overwritten. */ for(i=0; i<nOld; i++){ - MemPage *p = apCopy[i] = (MemPage*)&aCopy[i][pBt->pageSize]; - p->aData = &((u8*)p)[-pBt->pageSize]; - memcpy(p->aData, apOld[i]->aData, pBt->pageSize + sizeof(MemPage)); - /* The memcpy() above changes the value of p->aData so we have to - ** set it again. */ - p->aData = &((u8*)p)[-pBt->pageSize]; + MemPage *p = apCopy[i] = (MemPage*)aCopy[i]; + memcpy(p, apOld[i], sizeof(MemPage)); + p->aData = (void*)&p[1]; + memcpy(p->aData, apOld[i]->aData, pBt->pageSize); } /* @@ -4736,6 +5029,10 @@ static int balance_nonroot(MemPage *pPage){ memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4); }else{ assert( leafCorrection==4 ); + if( szCell[nCell]<4 ){ + /* Do not allow any cells smaller than 4 bytes. */ + szCell[nCell] = 4; + } } nCell++; } @@ -4822,15 +5119,16 @@ static int balance_nonroot(MemPage *pPage){ pNew = apNew[i] = apOld[i]; pgnoNew[i] = pgnoOld[i]; apOld[i] = 0; - rc = sqlite3pager_write(pNew->aData); + rc = sqlite3PagerWrite(pNew->pDbPage); + nNew++; if( rc ) goto balance_cleanup; }else{ assert( i>0 ); - rc = allocatePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0); + rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0); if( rc ) goto balance_cleanup; apNew[i] = pNew; + nNew++; } - nNew++; zeroPage(pNew, pageFlags); } @@ -4938,16 +5236,16 @@ static int balance_nonroot(MemPage *pPage){ memcpy(&pNew->aData[8], pCell, 4); pTemp = 0; }else if( leafData ){ - /* If the tree is a leaf-data tree, and the siblings are leaves, + /* If the tree is a leaf-data tree, and the siblings are leaves, ** then there is no divider cell in apCell[]. Instead, the divider ** cell consists of the integer key for the right-most cell of ** the sibling-page assembled above only. */ CellInfo info; j--; - parseCellPtr(pNew, apCell[j], &info); + sqlite3BtreeParseCellPtr(pNew, apCell[j], &info); pCell = &aSpace[iSpace]; - fillInCell(pParent, pCell, 0, info.nKey, 0, 0, &sz); + fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz); iSpace += sz; assert( iSpace<=pBt->pageSize*5 ); pTemp = 0; @@ -4956,6 +5254,21 @@ static int balance_nonroot(MemPage *pPage){ pTemp = &aSpace[iSpace]; iSpace += sz; assert( iSpace<=pBt->pageSize*5 ); + /* Obscure case for non-leaf-data trees: If the cell at pCell was + ** previously stored on a leaf node, and its reported size was 4 + ** bytes, then it may actually be smaller than this + ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of + ** any cell). But it is important to pass the correct size to + ** insertCell(), so reparse the cell now. + ** + ** Note that this can never happen in an SQLite data file, as all + ** cells are at least 4 bytes. It only happens in b-trees used + ** to evaluate "IN (SELECT ...)" and similar clauses. + */ + if( szCell[j]==4 ){ + assert(leafCorrection==4); + sz = cellSizePtr(pParent, pCell); + } } rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4); if( rc!=SQLITE_OK ) goto balance_cleanup; @@ -5007,15 +5320,13 @@ static int balance_nonroot(MemPage *pPage){ ** But the parent page will always be initialized. */ assert( pParent->isInit ); - /* assert( pPage->isInit ); // No! pPage might have been added to freelist */ - /* pageIntegrity(pPage); // No! pPage might have been added to freelist */ rc = balance(pParent, 0); /* ** Cleanup before returning. */ balance_cleanup: - sqliteFree(apCell); + sqlite3_free(apCell); for(i=0; i<nOld; i++){ releasePage(apOld[i]); } @@ -5044,9 +5355,10 @@ static int balance_shallower(MemPage *pPage){ assert( pPage->pParent==0 ); assert( pPage->nCell==0 ); + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); pBt = pPage->pBt; mxCellPerPage = MX_CELL(pBt); - apCell = sqliteMallocRaw( mxCellPerPage*(sizeof(u8*)+sizeof(int)) ); + apCell = sqlite3_malloc( mxCellPerPage*(sizeof(u8*)+sizeof(int)) ); if( apCell==0 ) return SQLITE_NOMEM; szCell = (int*)&apCell[mxCellPerPage]; if( pPage->leaf ){ @@ -5067,11 +5379,11 @@ static int balance_shallower(MemPage *pPage){ */ pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]); assert( pgnoChild>0 ); - assert( pgnoChild<=sqlite3pager_pagecount(pPage->pBt->pPager) ); - rc = getPage(pPage->pBt, pgnoChild, &pChild); + assert( pgnoChild<=sqlite3PagerPagecount(pPage->pBt->pPager) ); + rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0); if( rc ) goto end_shallow_balance; if( pPage->pgno==1 ){ - rc = initPage(pChild, pPage); + rc = sqlite3BtreeInitPage(pChild, pPage); if( rc ) goto end_shallow_balance; assert( pChild->nOverflow==0 ); if( pChild->nFree>=100 ){ @@ -5098,7 +5410,7 @@ static int balance_shallower(MemPage *pPage){ memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize); pPage->isInit = 0; pPage->pParent = 0; - rc = initPage(pPage, 0); + rc = sqlite3BtreeInitPage(pPage, 0); assert( rc==SQLITE_OK ); freePage(pChild); TRACE(("BALANCE: transfer child %d into root %d\n", @@ -5117,11 +5429,10 @@ static int balance_shallower(MemPage *pPage){ } } #endif - if( rc!=SQLITE_OK ) goto end_shallow_balance; releasePage(pChild); } end_shallow_balance: - sqliteFree(apCell); + sqlite3_free(apCell); return rc; } @@ -5149,9 +5460,10 @@ static int balance_deeper(MemPage *pPage){ assert( pPage->pParent==0 ); assert( pPage->nOverflow>0 ); pBt = pPage->pBt; - rc = allocatePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0); + assert( sqlite3_mutex_held(pBt->mutex) ); + rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0); if( rc ) return rc; - assert( sqlite3pager_iswriteable(pChild->aData) ); + assert( sqlite3PagerIswriteable(pChild->pDbPage) ); usableSize = pBt->usableSize; data = pPage->aData; hdr = pPage->hdrOffset; @@ -5160,7 +5472,7 @@ static int balance_deeper(MemPage *pPage){ memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr); memcpy(&cdata[brk], &data[brk], usableSize-brk); assert( pChild->isInit==0 ); - rc = initPage(pChild, pPage); + rc = sqlite3BtreeInitPage(pChild, pPage); if( rc ) goto balancedeeper_out; memcpy(pChild->aOvfl, pPage->aOvfl, pPage->nOverflow*sizeof(pPage->aOvfl[0])); pChild->nOverflow = pPage->nOverflow; @@ -5197,8 +5509,10 @@ balancedeeper_out: */ static int balance(MemPage *pPage, int insert){ int rc = SQLITE_OK; + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); if( pPage->pParent==0 ){ - if( pPage->nOverflow>0 ){ + rc = sqlite3PagerWrite(pPage->pDbPage); + if( rc==SQLITE_OK && pPage->nOverflow>0 ){ rc = balance_deeper(pPage); } if( rc==SQLITE_OK && pPage->nCell==0 ){ @@ -5215,27 +5529,36 @@ static int balance(MemPage *pPage, int insert){ /* ** This routine checks all cursors that point to table pgnoRoot. -** If any of those cursors other than pExclude were opened with -** wrFlag==0 then this routine returns SQLITE_LOCKED. If all -** cursors that point to pgnoRoot were opened with wrFlag==1 -** then this routine returns SQLITE_OK. +** If any of those cursors were opened with wrFlag==0 in a different +** database connection (a database connection that shares the pager +** cache with the current connection) and that other connection +** is not in the ReadUncommmitted state, then this routine returns +** SQLITE_LOCKED. ** ** In addition to checking for read-locks (where a read-lock ** means a cursor opened with wrFlag==0) this routine also moves -** all cursors other than pExclude so that they are pointing to the -** first Cell on root page. This is necessary because an insert +** all write cursors so that they are pointing to the +** first Cell on the root page. This is necessary because an insert ** or delete might change the number of cells on a page or delete ** a page entirely and we do not want to leave any cursors ** pointing to non-existant pages or cells. */ -static int checkReadLocks(BtShared *pBt, Pgno pgnoRoot, BtCursor *pExclude){ +static int checkReadLocks(Btree *pBtree, Pgno pgnoRoot, BtCursor *pExclude){ BtCursor *p; + BtShared *pBt = pBtree->pBt; + sqlite3 *db = pBtree->db; + assert( sqlite3BtreeHoldsMutex(pBtree) ); for(p=pBt->pCursor; p; p=p->pNext){ - u32 flags = (p->pBtree->pSqlite ? p->pBtree->pSqlite->flags : 0); - if( p->pgnoRoot!=pgnoRoot || p==pExclude ) continue; - if( p->wrFlag==0 && flags&SQLITE_ReadUncommitted ) continue; - if( p->wrFlag==0 ) return SQLITE_LOCKED; - if( p->pPage->pgno!=p->pgnoRoot ){ + if( p==pExclude ) continue; + if( p->eState!=CURSOR_VALID ) continue; + if( p->pgnoRoot!=pgnoRoot ) continue; + if( p->wrFlag==0 ){ + sqlite3 *dbOther = p->pBtree->db; + if( dbOther==0 || + (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){ + return SQLITE_LOCKED; + } + }else if( p->pPage->pgno!=p->pgnoRoot ){ moveToRoot(p); } } @@ -5254,33 +5577,41 @@ static int checkReadLocks(BtShared *pBt, Pgno pgnoRoot, BtCursor *pExclude){ int sqlite3BtreeInsert( BtCursor *pCur, /* Insert data into the table of this cursor */ const void *pKey, i64 nKey, /* The key of the new record */ - const void *pData, int nData /* The data of the new record */ + const void *pData, int nData, /* The data of the new record */ + int nZero, /* Number of extra 0 bytes to append to data */ + int appendBias /* True if this is likely an append */ ){ int rc; int loc; int szNew; MemPage *pPage; - BtShared *pBt = pCur->pBtree->pBt; + Btree *p = pCur->pBtree; + BtShared *pBt = p->pBt; unsigned char *oldCell; unsigned char *newCell = 0; + assert( cursorHoldsMutex(pCur) ); if( pBt->inTransaction!=TRANS_WRITE ){ /* Must start a transaction before doing an insert */ - return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; + return rc; } assert( !pBt->readOnly ); if( !pCur->wrFlag ){ return SQLITE_PERM; /* Cursor not open for writing */ } - if( checkReadLocks(pBt, pCur->pgnoRoot, pCur) ){ + if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur) ){ return SQLITE_LOCKED; /* The table pCur points to has a read lock */ } + if( pCur->eState==CURSOR_FAULT ){ + return pCur->skip; + } /* Save the positions of any other cursors open on this table */ - restoreOrClearCursorPosition(pCur, 0); + clearCursorPosition(pCur); if( SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) || - SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, &loc)) + SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc)) ){ return rc; } @@ -5292,17 +5623,19 @@ int sqlite3BtreeInsert( pCur->pgnoRoot, nKey, nData, pPage->pgno, loc==0 ? "overwrite" : "new entry")); assert( pPage->isInit ); - rc = sqlite3pager_write(pPage->aData); - if( rc ) return rc; - newCell = sqliteMallocRaw( MX_CELL_SIZE(pBt) ); + newCell = sqlite3_malloc( MX_CELL_SIZE(pBt) ); if( newCell==0 ) return SQLITE_NOMEM; - rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, &szNew); + rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew); if( rc ) goto end_insert; assert( szNew==cellSizePtr(pPage, newCell) ); assert( szNew<=MX_CELL_SIZE(pBt) ); if( loc==0 && CURSOR_VALID==pCur->eState ){ int szOld; assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); + rc = sqlite3PagerWrite(pPage->pDbPage); + if( rc ){ + goto end_insert; + } oldCell = findCell(pPage, pCur->idx); if( !pPage->leaf ){ memcpy(newCell, oldCell, 4); @@ -5327,7 +5660,7 @@ int sqlite3BtreeInsert( moveToRoot(pCur); } end_insert: - sqliteFree(newCell); + sqlite3_free(newCell); return rc; } @@ -5340,38 +5673,44 @@ int sqlite3BtreeDelete(BtCursor *pCur){ unsigned char *pCell; int rc; Pgno pgnoChild = 0; - BtShared *pBt = pCur->pBtree->pBt; + Btree *p = pCur->pBtree; + BtShared *pBt = p->pBt; + assert( cursorHoldsMutex(pCur) ); assert( pPage->isInit ); if( pBt->inTransaction!=TRANS_WRITE ){ /* Must start a transaction before doing a delete */ - return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; + return rc; } assert( !pBt->readOnly ); + if( pCur->eState==CURSOR_FAULT ){ + return pCur->skip; + } if( pCur->idx >= pPage->nCell ){ return SQLITE_ERROR; /* The cursor is not pointing to anything */ } if( !pCur->wrFlag ){ return SQLITE_PERM; /* Did not open this cursor for writing */ } - if( checkReadLocks(pBt, pCur->pgnoRoot, pCur) ){ + if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur) ){ return SQLITE_LOCKED; /* The table pCur points to has a read lock */ } /* Restore the current cursor position (a no-op if the cursor is not in ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors - ** open on the same table. Then call sqlite3pager_write() on the page + ** open on the same table. Then call sqlite3PagerWrite() on the page ** that the entry will be deleted from. */ if( - (rc = restoreOrClearCursorPosition(pCur, 1))!=0 || + (rc = restoreOrClearCursorPosition(pCur))!=0 || (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 || - (rc = sqlite3pager_write(pPage->aData))!=0 + (rc = sqlite3PagerWrite(pPage->pDbPage))!=0 ){ return rc; } - /* Locate the cell within it's page and leave pCell pointing to the + /* Locate the cell within its page and leave pCell pointing to the ** data. The clearCell() call frees any overflow pages associated with the ** cell. The cell itself is still intact. */ @@ -5380,7 +5719,9 @@ int sqlite3BtreeDelete(BtCursor *pCur){ pgnoChild = get4byte(pCell); } rc = clearCell(pPage, pCell); - if( rc ) return rc; + if( rc ){ + return rc; + } if( !pPage->leaf ){ /* @@ -5392,47 +5733,40 @@ int sqlite3BtreeDelete(BtCursor *pCur){ */ BtCursor leafCur; unsigned char *pNext; - int szNext; /* The compiler warning is wrong: szNext is always - ** initialized before use. Adding an extra initialization - ** to silence the compiler slows down the code. */ int notUsed; unsigned char *tempCell = 0; assert( !pPage->leafData ); - getTempCursor(pCur, &leafCur); + sqlite3BtreeGetTempCursor(pCur, &leafCur); rc = sqlite3BtreeNext(&leafCur, ¬Used); - if( rc!=SQLITE_OK ){ - if( rc!=SQLITE_NOMEM ){ - rc = SQLITE_CORRUPT_BKPT; - } - } if( rc==SQLITE_OK ){ - rc = sqlite3pager_write(leafCur.pPage->aData); + rc = sqlite3PagerWrite(leafCur.pPage->pDbPage); } if( rc==SQLITE_OK ){ + int szNext; TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n", pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno)); dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell)); pNext = findCell(leafCur.pPage, leafCur.idx); szNext = cellSizePtr(leafCur.pPage, pNext); assert( MX_CELL_SIZE(pBt)>=szNext+4 ); - tempCell = sqliteMallocRaw( MX_CELL_SIZE(pBt) ); + tempCell = sqlite3_malloc( MX_CELL_SIZE(pBt) ); if( tempCell==0 ){ rc = SQLITE_NOMEM; } + if( rc==SQLITE_OK ){ + rc = insertCell(pPage, pCur->idx, pNext-4, szNext+4, tempCell, 0); + } + if( rc==SQLITE_OK ){ + put4byte(findOverflowCell(pPage, pCur->idx), pgnoChild); + rc = balance(pPage, 0); + } + if( rc==SQLITE_OK ){ + dropCell(leafCur.pPage, leafCur.idx, szNext); + rc = balance(leafCur.pPage, 0); + } } - if( rc==SQLITE_OK ){ - rc = insertCell(pPage, pCur->idx, pNext-4, szNext+4, tempCell, 0); - } - if( rc==SQLITE_OK ){ - put4byte(findOverflowCell(pPage, pCur->idx), pgnoChild); - rc = balance(pPage, 0); - } - if( rc==SQLITE_OK ){ - dropCell(leafCur.pPage, leafCur.idx, szNext); - rc = balance(leafCur.pPage, 0); - } - sqliteFree(tempCell); - releaseTempCursor(&leafCur); + sqlite3_free(tempCell); + sqlite3BtreeReleaseTempCursor(&leafCur); }else{ TRACE(("DELETE: table=%d delete from leaf %d\n", pCur->pgnoRoot, pPage->pgno)); @@ -5456,46 +5790,51 @@ int sqlite3BtreeDelete(BtCursor *pCur){ ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys ** BTREE_ZERODATA Used for SQL indices */ -int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ +static int btreeCreateTable(Btree *p, int *piTable, int flags){ BtShared *pBt = p->pBt; MemPage *pRoot; Pgno pgnoRoot; int rc; + + assert( sqlite3BtreeHoldsMutex(p) ); if( pBt->inTransaction!=TRANS_WRITE ){ /* Must start a transaction first */ - return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; + return rc; } assert( !pBt->readOnly ); - /* It is illegal to create a table if any cursors are open on the - ** database. This is because in auto-vacuum mode the backend may - ** need to move a database page to make room for the new root-page. - ** If an open cursor was using the page a problem would occur. - */ - if( pBt->pCursor ){ - return SQLITE_LOCKED; - } - #ifdef SQLITE_OMIT_AUTOVACUUM - rc = allocatePage(pBt, &pRoot, &pgnoRoot, 1, 0); - if( rc ) return rc; + rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); + if( rc ){ + return rc; + } #else if( pBt->autoVacuum ){ Pgno pgnoMove; /* Move a page here to make room for the root-page */ MemPage *pPageMove; /* The page to move to. */ + /* Creating a new table may probably require moving an existing database + ** to make room for the new tables root page. In case this page turns + ** out to be an overflow page, delete all overflow page-map caches + ** held by open cursors. + */ + invalidateAllOverflowCache(pBt); + /* Read the value of meta[3] from the database to determine where the ** root page of the new table should go. meta[3] is the largest root-page ** created so far, so the new root-page is (meta[3]+1). */ rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot); - if( rc!=SQLITE_OK ) return rc; + if( rc!=SQLITE_OK ){ + return rc; + } pgnoRoot++; /* The new root-page may not be allocated on a pointer-map page, or the ** PENDING_BYTE page. */ - if( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) || + while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) || pgnoRoot==PENDING_BYTE_PAGE(pBt) ){ pgnoRoot++; } @@ -5505,17 +5844,25 @@ int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ ** be moved to the allocated page (unless the allocated page happens ** to reside at pgnoRoot). */ - rc = allocatePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1); + rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1); if( rc!=SQLITE_OK ){ return rc; } if( pgnoMove!=pgnoRoot ){ + /* pgnoRoot is the page that will be used for the root-page of + ** the new table (assuming an error did not occur). But we were + ** allocated pgnoMove. If required (i.e. if it was not allocated + ** by extending the file), the current page at position pgnoMove + ** is already journaled. + */ u8 eType; Pgno iPtrPage; releasePage(pPageMove); - rc = getPage(pBt, pgnoRoot, &pRoot); + + /* Move the page currently at pgnoRoot to pgnoMove. */ + rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0); if( rc!=SQLITE_OK ){ return rc; } @@ -5526,21 +5873,23 @@ int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ } assert( eType!=PTRMAP_ROOTPAGE ); assert( eType!=PTRMAP_FREEPAGE ); - rc = sqlite3pager_write(pRoot->aData); + rc = sqlite3PagerWrite(pRoot->pDbPage); if( rc!=SQLITE_OK ){ releasePage(pRoot); return rc; } rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove); releasePage(pRoot); + + /* Obtain the page at pgnoRoot */ if( rc!=SQLITE_OK ){ return rc; } - rc = getPage(pBt, pgnoRoot, &pRoot); + rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0); if( rc!=SQLITE_OK ){ return rc; } - rc = sqlite3pager_write(pRoot->aData); + rc = sqlite3PagerWrite(pRoot->pDbPage); if( rc!=SQLITE_OK ){ releasePage(pRoot); return rc; @@ -5562,16 +5911,24 @@ int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ } }else{ - rc = allocatePage(pBt, &pRoot, &pgnoRoot, 1, 0); + rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); if( rc ) return rc; } #endif - assert( sqlite3pager_iswriteable(pRoot->aData) ); + assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); zeroPage(pRoot, flags | PTF_LEAF); - sqlite3pager_unref(pRoot->aData); + sqlite3PagerUnref(pRoot->pDbPage); *piTable = (int)pgnoRoot; return SQLITE_OK; } +int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ + int rc; + sqlite3BtreeEnter(p); + p->pBt->db = p->db; + rc = btreeCreateTable(p, piTable, flags); + sqlite3BtreeLeave(p); + return rc; +} /* ** Erase the given database page and all its children. Return @@ -5588,14 +5945,13 @@ static int clearDatabasePage( unsigned char *pCell; int i; - if( pgno>sqlite3pager_pagecount(pBt->pPager) ){ + assert( sqlite3_mutex_held(pBt->mutex) ); + if( pgno>sqlite3PagerPagecount(pBt->pPager) ){ return SQLITE_CORRUPT_BKPT; } rc = getAndInitPage(pBt, pgno, &pPage, pParent); if( rc ) goto cleardatabasepage_out; - rc = sqlite3pager_write(pPage->aData); - if( rc ) goto cleardatabasepage_out; for(i=0; i<pPage->nCell; i++){ pCell = findCell(pPage, i); if( !pPage->leaf ){ @@ -5611,7 +5967,7 @@ static int clearDatabasePage( } if( freePageFlag ){ rc = freePage(pPage); - }else{ + }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){ zeroPage(pPage, pPage->aData[0] | PTF_LEAF); } @@ -5631,33 +5987,20 @@ cleardatabasepage_out: */ int sqlite3BtreeClearTable(Btree *p, int iTable){ int rc; - BtCursor *pCur; BtShared *pBt = p->pBt; - sqlite3 *db = p->pSqlite; + sqlite3BtreeEnter(p); + pBt->db = p->db; if( p->inTrans!=TRANS_WRITE ){ - return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; - } - - /* If this connection is not in read-uncommitted mode and currently has - ** a read-cursor open on the table being cleared, return SQLITE_LOCKED. - */ - if( 0==db || 0==(db->flags&SQLITE_ReadUncommitted) ){ - for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ - if( pCur->pBtree==p && pCur->pgnoRoot==(Pgno)iTable ){ - if( 0==pCur->wrFlag ){ - return SQLITE_LOCKED; - } - moveToRoot(pCur); - } - } - } - - /* Save the position of all cursors open on this table */ - if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){ - return rc; + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; + }else if( (rc = checkReadLocks(p, iTable, 0))!=SQLITE_OK ){ + /* nothing to do */ + }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){ + /* nothing to do */ + }else{ + rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0); } - - return clearDatabasePage(pBt, (Pgno)iTable, 0, 0); + sqlite3BtreeLeave(p); + return rc; } /* @@ -5680,11 +6023,12 @@ int sqlite3BtreeClearTable(Btree *p, int iTable){ ** The last root page is recorded in meta[3] and the value of ** meta[3] is updated by this procedure. */ -int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ +static int btreeDropTable(Btree *p, int iTable, int *piMoved){ int rc; MemPage *pPage = 0; BtShared *pBt = p->pBt; + assert( sqlite3BtreeHoldsMutex(p) ); if( p->inTrans!=TRANS_WRITE ){ return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; } @@ -5699,7 +6043,7 @@ int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ return SQLITE_LOCKED; } - rc = getPage(pBt, (Pgno)iTable, &pPage); + rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0); if( rc ) return rc; rc = sqlite3BtreeClearTable(p, iTable); if( rc ){ @@ -5738,7 +6082,7 @@ int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ */ MemPage *pMove; releasePage(pPage); - rc = getPage(pBt, maxRootPgno, &pMove); + rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0); if( rc!=SQLITE_OK ){ return rc; } @@ -5747,7 +6091,7 @@ int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ if( rc!=SQLITE_OK ){ return rc; } - rc = getPage(pBt, maxRootPgno, &pMove); + rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0); if( rc!=SQLITE_OK ){ return rc; } @@ -5786,6 +6130,14 @@ int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ } return rc; } +int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ + int rc; + sqlite3BtreeEnter(p); + p->pBt->db = p->db; + rc = btreeDropTable(p, iTable, piMoved); + sqlite3BtreeLeave(p); + return rc; +} /* @@ -5799,10 +6151,14 @@ int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ ** free pages is not visible. So Cookie[0] is the same as Meta[1]. */ int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ + DbPage *pDbPage; int rc; unsigned char *pP1; BtShared *pBt = p->pBt; + sqlite3BtreeEnter(p); + pBt->db = p->db; + /* Reading a meta-data value requires a read-lock on page 1 (and hence ** the sqlite_master table. We grab this lock regardless of whether or ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page @@ -5810,14 +6166,19 @@ int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ */ rc = queryTableLock(p, 1, READ_LOCK); if( rc!=SQLITE_OK ){ + sqlite3BtreeLeave(p); return rc; } assert( idx>=0 && idx<=15 ); - rc = sqlite3pager_get(pBt->pPager, 1, (void**)&pP1); - if( rc ) return rc; + rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage); + if( rc ){ + sqlite3BtreeLeave(p); + return rc; + } + pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage); *pMeta = get4byte(&pP1[36 + idx*4]); - sqlite3pager_unref(pP1); + sqlite3PagerUnref(pDbPage); /* If autovacuumed is disabled in this build but we are trying to ** access an autovacuumed database, then make the database readonly. @@ -5828,6 +6189,7 @@ int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ /* Grab the read-lock on page 1. */ rc = lockTable(p, 1, READ_LOCK); + sqlite3BtreeLeave(p); return rc; } @@ -5840,15 +6202,27 @@ int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){ unsigned char *pP1; int rc; assert( idx>=1 && idx<=15 ); + sqlite3BtreeEnter(p); + pBt->db = p->db; if( p->inTrans!=TRANS_WRITE ){ - return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; + }else{ + assert( pBt->pPage1!=0 ); + pP1 = pBt->pPage1->aData; + rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); + if( rc==SQLITE_OK ){ + put4byte(&pP1[36 + idx*4], iMeta); +#ifndef SQLITE_OMIT_AUTOVACUUM + if( idx==7 ){ + assert( pBt->autoVacuum || iMeta==0 ); + assert( iMeta==0 || iMeta==1 ); + pBt->incrVacuum = iMeta; + } +#endif + } } - assert( pBt->pPage1!=0 ); - pP1 = pBt->pPage1->aData; - rc = sqlite3pager_write(pP1); - if( rc ) return rc; - put4byte(&pP1[36 + idx*4], iMeta); - return SQLITE_OK; + sqlite3BtreeLeave(p); + return rc; } /* @@ -5859,185 +6233,14 @@ int sqlite3BtreeFlags(BtCursor *pCur){ /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call ** restoreOrClearCursorPosition() here. */ - MemPage *pPage = pCur->pPage; - return pPage ? pPage->aData[pPage->hdrOffset] : 0; -} - -#ifdef SQLITE_DEBUG -/* -** Print a disassembly of the given page on standard output. This routine -** is used for debugging and testing only. -*/ -static int btreePageDump(BtShared *pBt, int pgno, int recursive, MemPage *pParent){ - int rc; MemPage *pPage; - int i, j, c; - int nFree; - u16 idx; - int hdr; - int nCell; - int isInit; - unsigned char *data; - char range[20]; - unsigned char payload[20]; - - rc = getPage(pBt, (Pgno)pgno, &pPage); - isInit = pPage->isInit; - if( pPage->isInit==0 ){ - initPage(pPage, pParent); - } - if( rc ){ - return rc; - } - hdr = pPage->hdrOffset; - data = pPage->aData; - c = data[hdr]; - pPage->intKey = (c & (PTF_INTKEY|PTF_LEAFDATA))!=0; - pPage->zeroData = (c & PTF_ZERODATA)!=0; - pPage->leafData = (c & PTF_LEAFDATA)!=0; - pPage->leaf = (c & PTF_LEAF)!=0; - pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData)); - nCell = get2byte(&data[hdr+3]); - sqlite3DebugPrintf("PAGE %d: flags=0x%02x frag=%d parent=%d\n", pgno, - data[hdr], data[hdr+7], - (pPage->isInit && pPage->pParent) ? pPage->pParent->pgno : 0); - assert( hdr == (pgno==1 ? 100 : 0) ); - idx = hdr + 12 - pPage->leaf*4; - for(i=0; i<nCell; i++){ - CellInfo info; - Pgno child; - unsigned char *pCell; - int sz; - int addr; - - addr = get2byte(&data[idx + 2*i]); - pCell = &data[addr]; - parseCellPtr(pPage, pCell, &info); - sz = info.nSize; - sprintf(range,"%d..%d", addr, addr+sz-1); - if( pPage->leaf ){ - child = 0; - }else{ - child = get4byte(pCell); - } - sz = info.nData; - if( !pPage->intKey ) sz += info.nKey; - if( sz>sizeof(payload)-1 ) sz = sizeof(payload)-1; - memcpy(payload, &pCell[info.nHeader], sz); - for(j=0; j<sz; j++){ - if( payload[j]<0x20 || payload[j]>0x7f ) payload[j] = '.'; - } - payload[sz] = 0; - sqlite3DebugPrintf( - "cell %2d: i=%-10s chld=%-4d nk=%-4lld nd=%-4d payload=%s\n", - i, range, child, info.nKey, info.nData, payload - ); - } - if( !pPage->leaf ){ - sqlite3DebugPrintf("right_child: %d\n", get4byte(&data[hdr+8])); - } - nFree = 0; - i = 0; - idx = get2byte(&data[hdr+1]); - while( idx>0 && idx<pPage->pBt->usableSize ){ - int sz = get2byte(&data[idx+2]); - sprintf(range,"%d..%d", idx, idx+sz-1); - nFree += sz; - sqlite3DebugPrintf("freeblock %2d: i=%-10s size=%-4d total=%d\n", - i, range, sz, nFree); - idx = get2byte(&data[idx]); - i++; - } - if( idx!=0 ){ - sqlite3DebugPrintf("ERROR: next freeblock index out of range: %d\n", idx); - } - if( recursive && !pPage->leaf ){ - for(i=0; i<nCell; i++){ - unsigned char *pCell = findCell(pPage, i); - btreePageDump(pBt, get4byte(pCell), 1, pPage); - idx = get2byte(pCell); - } - btreePageDump(pBt, get4byte(&data[hdr+8]), 1, pPage); - } - pPage->isInit = isInit; - sqlite3pager_unref(data); - fflush(stdout); - return SQLITE_OK; -} -int sqlite3BtreePageDump(Btree *p, int pgno, int recursive){ - return btreePageDump(p->pBt, pgno, recursive, 0); + restoreOrClearCursorPosition(pCur); + pPage = pCur->pPage; + assert( cursorHoldsMutex(pCur) ); + assert( pPage->pBt==pCur->pBt ); + return pPage ? pPage->aData[pPage->hdrOffset] : 0; } -#endif - -#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) -/* -** Fill aResult[] with information about the entry and page that the -** cursor is pointing to. -** -** aResult[0] = The page number -** aResult[1] = The entry number -** aResult[2] = Total number of entries on this page -** aResult[3] = Cell size (local payload + header) -** aResult[4] = Number of free bytes on this page -** aResult[5] = Number of free blocks on the page -** aResult[6] = Total payload size (local + overflow) -** aResult[7] = Header size in bytes -** aResult[8] = Local payload size -** aResult[9] = Parent page number -** -** This routine is used for testing and debugging only. -*/ -int sqlite3BtreeCursorInfo(BtCursor *pCur, int *aResult, int upCnt){ - int cnt, idx; - MemPage *pPage = pCur->pPage; - BtCursor tmpCur; - - int rc = restoreOrClearCursorPosition(pCur, 1); - if( rc!=SQLITE_OK ){ - return rc; - } - pageIntegrity(pPage); - assert( pPage->isInit ); - getTempCursor(pCur, &tmpCur); - while( upCnt-- ){ - moveToParent(&tmpCur); - } - pPage = tmpCur.pPage; - pageIntegrity(pPage); - aResult[0] = sqlite3pager_pagenumber(pPage->aData); - assert( aResult[0]==pPage->pgno ); - aResult[1] = tmpCur.idx; - aResult[2] = pPage->nCell; - if( tmpCur.idx>=0 && tmpCur.idx<pPage->nCell ){ - getCellInfo(&tmpCur); - aResult[3] = tmpCur.info.nSize; - aResult[6] = tmpCur.info.nData; - aResult[7] = tmpCur.info.nHeader; - aResult[8] = tmpCur.info.nLocal; - }else{ - aResult[3] = 0; - aResult[6] = 0; - aResult[7] = 0; - aResult[8] = 0; - } - aResult[4] = pPage->nFree; - cnt = 0; - idx = get2byte(&pPage->aData[pPage->hdrOffset+1]); - while( idx>0 && idx<pPage->pBt->usableSize ){ - cnt++; - idx = get2byte(&pPage->aData[idx]); - } - aResult[5] = cnt; - if( pPage->pParent==0 || isRootPage(pPage) ){ - aResult[9] = 0; - }else{ - aResult[9] = pPage->pParent->pgno; - } - releaseTempCursor(&tmpCur); - return SQLITE_OK; -} -#endif /* ** Return the pager associated with a BTree. This routine is used for @@ -6047,19 +6250,6 @@ Pager *sqlite3BtreePager(Btree *p){ return p->pBt->pPager; } -/* -** This structure is passed around through all the sanity checking routines -** in order to keep track of some global state information. -*/ -typedef struct IntegrityCk IntegrityCk; -struct IntegrityCk { - BtShared *pBt; /* The tree being checked out */ - Pager *pPager; /* The associated pager. Also accessible by pBt->pPager */ - int nPage; /* Number of pages in the database */ - int *anRef; /* Number of times each page is referenced */ - char *zErrMsg; /* An error message. NULL of no errors seen. */ -}; - #ifndef SQLITE_OMIT_INTEGRITY_CHECK /* ** Append a message to the error message string. @@ -6072,19 +6262,22 @@ static void checkAppendMsg( ){ va_list ap; char *zMsg2; + if( !pCheck->mxErr ) return; + pCheck->mxErr--; + pCheck->nErr++; va_start(ap, zFormat); - zMsg2 = sqlite3VMPrintf(zFormat, ap); + zMsg2 = sqlite3VMPrintf(0, zFormat, ap); va_end(ap); if( zMsg1==0 ) zMsg1 = ""; if( pCheck->zErrMsg ){ char *zOld = pCheck->zErrMsg; pCheck->zErrMsg = 0; sqlite3SetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0); - sqliteFree(zOld); + sqlite3_free(zOld); }else{ sqlite3SetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0); } - sqliteFree(zMsg2); + sqlite3_free(zMsg2); } #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ @@ -6155,8 +6348,9 @@ static void checkList( int i; int expected = N; int iFirst = iPage; - while( N-- > 0 ){ - unsigned char *pOvfl; + while( N-- > 0 && pCheck->mxErr ){ + DbPage *pOvflPage; + unsigned char *pOvflData; if( iPage<1 ){ checkAppendMsg(pCheck, zContext, "%d of %d pages missing from overflow list starting at %d", @@ -6164,12 +6358,13 @@ static void checkList( break; } if( checkRef(pCheck, iPage, zContext) ) break; - if( sqlite3pager_get(pCheck->pPager, (Pgno)iPage, (void**)&pOvfl) ){ + if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){ checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage); break; } + pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage); if( isFreeList ){ - int n = get4byte(&pOvfl[4]); + int n = get4byte(&pOvflData[4]); #ifndef SQLITE_OMIT_AUTOVACUUM if( pCheck->pBt->autoVacuum ){ checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext); @@ -6181,7 +6376,7 @@ static void checkList( N--; }else{ for(i=0; i<n; i++){ - Pgno iFreePage = get4byte(&pOvfl[8+i*4]); + Pgno iFreePage = get4byte(&pOvflData[8+i*4]); #ifndef SQLITE_OMIT_AUTOVACUUM if( pCheck->pBt->autoVacuum ){ checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext); @@ -6199,13 +6394,13 @@ static void checkList( ** the following page matches iPage. */ if( pCheck->pBt->autoVacuum && N>0 ){ - i = get4byte(pOvfl); + i = get4byte(pOvflData); checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext); } } #endif - iPage = get4byte(pOvfl); - sqlite3pager_unref(pOvfl); + iPage = get4byte(pOvflData); + sqlite3PagerUnref(pOvflPage); } } #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ @@ -6245,7 +6440,7 @@ static int checkTreePage( char zContext[100]; char *hit; - sprintf(zContext, "Page %d: ", iPage); + sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage); /* Check that the page exists */ @@ -6253,13 +6448,14 @@ static int checkTreePage( usableSize = pBt->usableSize; if( iPage==0 ) return 0; if( checkRef(pCheck, iPage, zParentContext) ) return 0; - if( (rc = getPage(pBt, (Pgno)iPage, &pPage))!=0 ){ + if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){ checkAppendMsg(pCheck, zContext, "unable to get the page. error code=%d", rc); return 0; } - if( (rc = initPage(pPage, pParent))!=0 ){ - checkAppendMsg(pCheck, zContext, "initPage() returns error code %d", rc); + if( (rc = sqlite3BtreeInitPage(pPage, pParent))!=0 ){ + checkAppendMsg(pCheck, zContext, + "sqlite3BtreeInitPage() returns error code %d", rc); releasePage(pPage); return 0; } @@ -6267,18 +6463,20 @@ static int checkTreePage( /* Check out all the cells. */ depth = 0; - for(i=0; i<pPage->nCell; i++){ + for(i=0; i<pPage->nCell && pCheck->mxErr; i++){ u8 *pCell; int sz; CellInfo info; /* Check payload overflow pages */ - sprintf(zContext, "On tree page %d cell %d: ", iPage, i); + sqlite3_snprintf(sizeof(zContext), zContext, + "On tree page %d cell %d: ", iPage, i); pCell = findCell(pPage,i); - parseCellPtr(pPage, pCell, &info); + sqlite3BtreeParseCellPtr(pPage, pCell, &info); sz = info.nData; if( !pPage->intKey ) sz += info.nKey; + assert( sz==info.nPayload ); if( sz>info.nLocal ){ int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4); Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); @@ -6308,7 +6506,8 @@ static int checkTreePage( } if( !pPage->leaf ){ pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); - sprintf(zContext, "On page %d at right child: ", iPage); + sqlite3_snprintf(sizeof(zContext), zContext, + "On page %d at right child: ", iPage); #ifndef SQLITE_OMIT_AUTOVACUUM if( pBt->autoVacuum ){ checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0); @@ -6321,7 +6520,7 @@ static int checkTreePage( */ data = pPage->aData; hdr = pPage->hdrOffset; - hit = sqliteMalloc( usableSize ); + hit = sqlite3MallocZero( usableSize ); if( hit ){ memset(hit, 1, get2byte(&data[hdr+5])); nCell = get2byte(&data[hdr+3]); @@ -6364,7 +6563,7 @@ static int checkTreePage( cnt, data[hdr+7], iPage); } } - sqliteFree(hit); + sqlite3_free(hit); releasePage(pPage); return depth+1; @@ -6382,27 +6581,47 @@ static int checkTreePage( ** and a pointer to that error message is returned. The calling function ** is responsible for freeing the error message when it is done. */ -char *sqlite3BtreeIntegrityCheck(Btree *p, int *aRoot, int nRoot){ +char *sqlite3BtreeIntegrityCheck( + Btree *p, /* The btree to be checked */ + int *aRoot, /* An array of root pages numbers for individual trees */ + int nRoot, /* Number of entries in aRoot[] */ + int mxErr, /* Stop reporting errors after this many */ + int *pnErr /* Write number of errors seen to this variable */ +){ int i; int nRef; IntegrityCk sCheck; BtShared *pBt = p->pBt; - nRef = sqlite3pager_refcount(pBt->pPager); + sqlite3BtreeEnter(p); + pBt->db = p->db; + nRef = sqlite3PagerRefcount(pBt->pPager); if( lockBtreeWithRetry(p)!=SQLITE_OK ){ - return sqliteStrDup("Unable to acquire a read lock on the database"); + sqlite3BtreeLeave(p); + return sqlite3StrDup("Unable to acquire a read lock on the database"); } sCheck.pBt = pBt; sCheck.pPager = pBt->pPager; - sCheck.nPage = sqlite3pager_pagecount(sCheck.pPager); + sCheck.nPage = sqlite3PagerPagecount(sCheck.pPager); + sCheck.mxErr = mxErr; + sCheck.nErr = 0; + *pnErr = 0; +#ifndef SQLITE_OMIT_AUTOVACUUM + if( pBt->nTrunc!=0 ){ + sCheck.nPage = pBt->nTrunc; + } +#endif if( sCheck.nPage==0 ){ unlockBtreeIfUnused(pBt); + sqlite3BtreeLeave(p); return 0; } - sCheck.anRef = sqliteMallocRaw( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) ); + sCheck.anRef = sqlite3_malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) ); if( !sCheck.anRef ){ unlockBtreeIfUnused(pBt); - return sqlite3MPrintf("Unable to malloc %d bytes", + *pnErr = 1; + sqlite3BtreeLeave(p); + return sqlite3MPrintf(p->db, "Unable to malloc %d bytes", (sCheck.nPage+1)*sizeof(sCheck.anRef[0])); } for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; } @@ -6419,7 +6638,7 @@ char *sqlite3BtreeIntegrityCheck(Btree *p, int *aRoot, int nRoot){ /* Check all the tables. */ - for(i=0; i<nRoot; i++){ + for(i=0; i<nRoot && sCheck.mxErr; i++){ if( aRoot[i]==0 ) continue; #ifndef SQLITE_OMIT_AUTOVACUUM if( pBt->autoVacuum && aRoot[i]>1 ){ @@ -6431,7 +6650,7 @@ char *sqlite3BtreeIntegrityCheck(Btree *p, int *aRoot, int nRoot){ /* Make sure every page in the file is referenced */ - for(i=1; i<=sCheck.nPage; i++){ + for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){ #ifdef SQLITE_OMIT_AUTOVACUUM if( sCheck.anRef[i]==0 ){ checkAppendMsg(&sCheck, 0, "Page %d is never used", i); @@ -6454,44 +6673,55 @@ char *sqlite3BtreeIntegrityCheck(Btree *p, int *aRoot, int nRoot){ /* Make sure this analysis did not leave any unref() pages */ unlockBtreeIfUnused(pBt); - if( nRef != sqlite3pager_refcount(pBt->pPager) ){ + if( nRef != sqlite3PagerRefcount(pBt->pPager) ){ checkAppendMsg(&sCheck, 0, "Outstanding page count goes from %d to %d during this analysis", - nRef, sqlite3pager_refcount(pBt->pPager) + nRef, sqlite3PagerRefcount(pBt->pPager) ); } /* Clean up and report errors. */ - sqliteFree(sCheck.anRef); + sqlite3BtreeLeave(p); + sqlite3_free(sCheck.anRef); + *pnErr = sCheck.nErr; return sCheck.zErrMsg; } #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ /* ** Return the full pathname of the underlying database file. +** +** The pager filename is invariant as long as the pager is +** open so it is safe to access without the BtShared mutex. */ const char *sqlite3BtreeGetFilename(Btree *p){ assert( p->pBt->pPager!=0 ); - return sqlite3pager_filename(p->pBt->pPager); + return sqlite3PagerFilename(p->pBt->pPager); } /* ** Return the pathname of the directory that contains the database file. +** +** The pager directory name is invariant as long as the pager is +** open so it is safe to access without the BtShared mutex. */ const char *sqlite3BtreeGetDirname(Btree *p){ assert( p->pBt->pPager!=0 ); - return sqlite3pager_dirname(p->pBt->pPager); + return sqlite3PagerDirname(p->pBt->pPager); } /* ** Return the pathname of the journal file for this database. The return ** value of this routine is the same regardless of whether the journal file ** has been created or not. +** +** The pager journal filename is invariant as long as the pager is +** open so it is safe to access without the BtShared mutex. */ const char *sqlite3BtreeGetJournalname(Btree *p){ assert( p->pBt->pPager!=0 ); - return sqlite3pager_journalname(p->pBt->pPager); + return sqlite3PagerJournalname(p->pBt->pPager); } #ifndef SQLITE_OMIT_VACUUM @@ -6502,52 +6732,78 @@ const char *sqlite3BtreeGetJournalname(Btree *p){ ** The size of file pBtFrom may be reduced by this operation. ** If anything goes wrong, the transaction on pBtFrom is rolled back. */ -int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){ +static int btreeCopyFile(Btree *pTo, Btree *pFrom){ int rc = SQLITE_OK; Pgno i, nPage, nToPage, iSkip; BtShared *pBtTo = pTo->pBt; BtShared *pBtFrom = pFrom->pBt; + pBtTo->db = pTo->db; + pBtFrom->db = pFrom->db; + if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){ return SQLITE_ERROR; } if( pBtTo->pCursor ) return SQLITE_BUSY; - nToPage = sqlite3pager_pagecount(pBtTo->pPager); - nPage = sqlite3pager_pagecount(pBtFrom->pPager); + nToPage = sqlite3PagerPagecount(pBtTo->pPager); + nPage = sqlite3PagerPagecount(pBtFrom->pPager); iSkip = PENDING_BYTE_PAGE(pBtTo); for(i=1; rc==SQLITE_OK && i<=nPage; i++){ - void *pPage; + DbPage *pDbPage; if( i==iSkip ) continue; - rc = sqlite3pager_get(pBtFrom->pPager, i, &pPage); + rc = sqlite3PagerGet(pBtFrom->pPager, i, &pDbPage); if( rc ) break; - rc = sqlite3pager_overwrite(pBtTo->pPager, i, pPage); - if( rc ) break; - sqlite3pager_unref(pPage); + rc = sqlite3PagerOverwrite(pBtTo->pPager, i, sqlite3PagerGetData(pDbPage)); + sqlite3PagerUnref(pDbPage); } + + /* If the file is shrinking, journal the pages that are being truncated + ** so that they can be rolled back if the commit fails. + */ for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){ - void *pPage; + DbPage *pDbPage; if( i==iSkip ) continue; - rc = sqlite3pager_get(pBtTo->pPager, i, &pPage); + rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage); if( rc ) break; - rc = sqlite3pager_write(pPage); - sqlite3pager_unref(pPage); - sqlite3pager_dont_write(pBtTo->pPager, i); + rc = sqlite3PagerWrite(pDbPage); + sqlite3PagerDontWrite(pDbPage); + /* Yeah. It seems wierd to call DontWrite() right after Write(). But + ** that is because the names of those procedures do not exactly + ** represent what they do. Write() really means "put this page in the + ** rollback journal and mark it as dirty so that it will be written + ** to the database file later." DontWrite() undoes the second part of + ** that and prevents the page from being written to the database. The + ** page is still on the rollback journal, though. And that is the whole + ** point of this loop: to put pages on the rollback journal. */ + sqlite3PagerUnref(pDbPage); } if( !rc && nPage<nToPage ){ - rc = sqlite3pager_truncate(pBtTo->pPager, nPage); + rc = sqlite3PagerTruncate(pBtTo->pPager, nPage); } + if( rc ){ sqlite3BtreeRollback(pTo); } return rc; } +int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){ + int rc; + sqlite3BtreeEnter(pTo); + sqlite3BtreeEnter(pFrom); + rc = btreeCopyFile(pTo, pFrom); + sqlite3BtreeLeave(pFrom); + sqlite3BtreeLeave(pTo); + return rc; +} + #endif /* SQLITE_OMIT_VACUUM */ /* ** Return non-zero if a transaction is active. */ int sqlite3BtreeIsInTrans(Btree *p){ + assert( p==0 || sqlite3_mutex_held(p->db->mutex) ); return (p && (p->inTrans==TRANS_WRITE)); } @@ -6555,6 +6811,7 @@ int sqlite3BtreeIsInTrans(Btree *p){ ** Return non-zero if a statement transaction is active. */ int sqlite3BtreeIsInStmt(Btree *p){ + assert( sqlite3BtreeHoldsMutex(p) ); return (p->pBt && p->pBt->inStmt); } @@ -6562,44 +6819,13 @@ int sqlite3BtreeIsInStmt(Btree *p){ ** Return non-zero if a read (or write) transaction is active. */ int sqlite3BtreeIsInReadTrans(Btree *p){ + assert( sqlite3_mutex_held(p->db->mutex) ); return (p && (p->inTrans!=TRANS_NONE)); } /* -** This call is a no-op if no write-transaction is currently active on pBt. -** -** Otherwise, sync the database file for the btree pBt. zMaster points to -** the name of a master journal file that should be written into the -** individual journal file, or is NULL, indicating no master journal file -** (single database transaction). -** -** When this is called, the master journal should already have been -** created, populated with this journal pointer and synced to disk. -** -** Once this is routine has returned, the only thing required to commit -** the write-transaction for this database file is to delete the journal. -*/ -int sqlite3BtreeSync(Btree *p, const char *zMaster){ - int rc = SQLITE_OK; - if( p->inTrans==TRANS_WRITE ){ - BtShared *pBt = p->pBt; - Pgno nTrunc = 0; -#ifndef SQLITE_OMIT_AUTOVACUUM - if( pBt->autoVacuum ){ - rc = autoVacuumCommit(pBt, &nTrunc); - if( rc!=SQLITE_OK ){ - return rc; - } - } -#endif - rc = sqlite3pager_sync(pBt->pPager, zMaster, nTrunc); - } - return rc; -} - -/* ** This function returns a pointer to a blob of memory associated with -** a single shared-btree. The memory is used by client code for it's own +** a single shared-btree. The memory is used by client code for its own ** purposes (for example, to store a high-level schema associated with ** the shared-btree). The btree layer manages reference counting issues. ** @@ -6610,15 +6836,17 @@ int sqlite3BtreeSync(Btree *p, const char *zMaster){ ** ** Just before the shared-btree is closed, the function passed as the ** xFree argument when the memory allocation was made is invoked on the -** blob of allocated memory. This function should not call sqliteFree() +** blob of allocated memory. This function should not call sqlite3_free() ** on the memory, the btree layer does that. */ void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ BtShared *pBt = p->pBt; + sqlite3BtreeEnter(p); if( !pBt->pSchema ){ - pBt->pSchema = sqliteMalloc(nBytes); + pBt->pSchema = sqlite3MallocZero(nBytes); pBt->xFreeSchema = xFree; } + sqlite3BtreeLeave(p); return pBt->pSchema; } @@ -6627,7 +6855,12 @@ void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ ** handle holds an exclusive lock on the sqlite_master table. */ int sqlite3BtreeSchemaLocked(Btree *p){ - return (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK); + int rc; + assert( sqlite3_mutex_held(p->db->mutex) ); + sqlite3BtreeEnter(p); + rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK); + sqlite3BtreeLeave(p); + return rc; } @@ -6640,40 +6873,71 @@ int sqlite3BtreeSchemaLocked(Btree *p){ int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ int rc = SQLITE_OK; u8 lockType = (isWriteLock?WRITE_LOCK:READ_LOCK); + sqlite3BtreeEnter(p); rc = queryTableLock(p, iTab, lockType); if( rc==SQLITE_OK ){ rc = lockTable(p, iTab, lockType); } + sqlite3BtreeLeave(p); return rc; } #endif -/* -** The following debugging interface has to be in this file (rather -** than in, for example, test1.c) so that it can get access to -** the definition of BtShared. -*/ -#if defined(SQLITE_DEBUG) && defined(TCLSH) -#include <tcl.h> -int sqlite3_shared_cache_report( - void * clientData, - Tcl_Interp *interp, - int objc, - Tcl_Obj *CONST objv[] -){ -#ifndef SQLITE_OMIT_SHARED_CACHE - const ThreadData *pTd = sqlite3ThreadDataReadOnly(); - if( pTd->useSharedData ){ - BtShared *pBt; - Tcl_Obj *pRet = Tcl_NewObj(); - for(pBt=pTd->pBtree; pBt; pBt=pBt->pNext){ - const char *zFile = sqlite3pager_filename(pBt->pPager); - Tcl_ListObjAppendElement(interp, pRet, Tcl_NewStringObj(zFile, -1)); - Tcl_ListObjAppendElement(interp, pRet, Tcl_NewIntObj(pBt->nRef)); +#ifndef SQLITE_OMIT_INCRBLOB +/* +** Argument pCsr must be a cursor opened for writing on an +** INTKEY table currently pointing at a valid table entry. +** This function modifies the data stored as part of that entry. +** Only the data content may only be modified, it is not possible +** to change the length of the data stored. +*/ +int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ + assert( cursorHoldsMutex(pCsr) ); + assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) ); + assert(pCsr->isIncrblobHandle); + if( pCsr->eState>=CURSOR_REQUIRESEEK ){ + if( pCsr->eState==CURSOR_FAULT ){ + return pCsr->skip; + }else{ + return SQLITE_ABORT; } - Tcl_SetObjResult(interp, pRet); } -#endif - return TCL_OK; + + /* Check some preconditions: + ** (a) the cursor is open for writing, + ** (b) there is no read-lock on the table being modified and + ** (c) the cursor points at a valid row of an intKey table. + */ + if( !pCsr->wrFlag ){ + return SQLITE_READONLY; + } + assert( !pCsr->pBt->readOnly + && pCsr->pBt->inTransaction==TRANS_WRITE ); + if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr) ){ + return SQLITE_LOCKED; /* The table pCur points to has a read lock */ + } + if( pCsr->eState==CURSOR_INVALID || !pCsr->pPage->intKey ){ + return SQLITE_ERROR; + } + + return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1); +} + +/* +** Set a flag on this cursor to cache the locations of pages from the +** overflow list for the current row. This is used by cursors opened +** for incremental blob IO only. +** +** This function sets a flag only. The actual page location cache +** (stored in BtCursor.aOverflow[]) is allocated and used by function +** accessPayload() (the worker function for sqlite3BtreeData() and +** sqlite3BtreePutData()). +*/ +void sqlite3BtreeCacheOverflow(BtCursor *pCur){ + assert( cursorHoldsMutex(pCur) ); + assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); + assert(!pCur->isIncrblobHandle); + assert(!pCur->aOverflow); + pCur->isIncrblobHandle = 1; } #endif |
