2 * Copyright (c) 2005 Teodor Sigaev <teodor@sigaev.ru>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the author nor the names of any co-contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
18 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
25 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
27 * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 #include <sys/types.h>
44 static int TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) ;
45 static int TBTWritePage(TBTree *db, TBTMemPage *page) ;
46 static int TBTNewPage(TBTree *db, TBTMemPage **page) ;
49 TBTOpen(TBTree *db, char *file) {
53 db->fd = open(file, O_RDONLY);
55 tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno));
58 if ( flock( db->fd, LOCK_SH ) < 0 ) {
59 tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno));
64 db->fd = open(file, O_CREAT | O_RDWR, 0666);
66 tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno));
69 if ( flock( db->fd, LOCK_EX ) < 0 ) {
70 tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno));
77 db->Cache=(TBTMemPage**)t0malloc( sizeof(TBTMemPage*)*db->npage );
78 db->TimeCache=(TBTMemPage**)t0malloc( sizeof(TBTMemPage*)*db->npage );
82 db->cachehits = db->pageread = db->pagewrite = 0;
84 if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) {
85 tlog(TL_CRIT,"TBTOpen: lseek failed: %s",strerror(errno));
86 flock( db->fd, LOCK_UN );
93 tlog(TL_CRIT,"TBTOpen: file is void");
94 flock( db->fd, LOCK_UN );
99 if ( TBTNewPage(db, &page) != TBT_OK ) {
100 flock( db->fd, LOCK_UN );
105 if ( TBTWritePage(db, page)!=TBT_OK ){
106 flock( db->fd, LOCK_UN );
113 db->lastpagenumber=0;
118 TBTClose(TBTree *db) {
127 for(i=0; i<db->curpage; i++)
128 tfree( db->Cache[i] );
131 tfree( db->TimeCache );
134 flock( db->fd, LOCK_UN );
137 return (rc) ? TBT_ERROR : TBT_OK;
141 TBTSync(TBTree *db) {
145 if ( db->readonly || db->npage==0 )
148 for(i=0; i<db->curpage; i++)
149 if ( !db->Cache[i]->issynced )
150 rc |= TBTWritePage(db, db->Cache[i]);
152 if ( fsync(db->fd) ) {
153 tlog(TL_CRIT,"TBTSync: fsync failed: %s",strerror(errno));
157 return (rc) ? TBT_ERROR : TBT_OK;
161 PageAlloc(TBTree *db, int *pos) {
164 if ( db->curpage < db->npage ) {
165 page = db->Cache[db->curpage] = db->TimeCache[db->curpage] = (TBTMemPage*)t0malloc(sizeof(TBTMemPage));
169 } else if ( db->npage == 0 ) {
170 page = (TBTMemPage*)t0malloc(sizeof(TBTMemPage));
173 for( i=0; i<db->curpage && db->TimeCache[i]->islocked; i++ );
174 if ( i == db->curpage ) { /*all pages locked*/
175 /*tlog(TL_WARN,"TBTReadPage: all pages in cache locked, increase number of cached pages");*/
176 page = (TBTMemPage*)t0malloc(sizeof(TBTMemPage));
178 page = db->TimeCache[i];
179 if ( !page->issynced )
180 if ( TBTWritePage(db, page) )
182 memset(page,0,sizeof(TBTMemPage));
192 moveFirstTime(TBTree *db, int pos) {
197 if (pos+1==db->curpage )
200 ptr = db->TimeCache[pos];
201 memmove( db->TimeCache + pos , db->TimeCache + pos + 1, sizeof(TBTMemPage*)*(db->curpage-pos-1) );
202 db->TimeCache[db->curpage-1] = ptr;
206 findAndMoveFirstTime(TBTree *db, TBTMemPage *page) {
208 for(pos=0;pos<db->curpage;pos++)
209 if ( page==db->TimeCache[pos] ) {
210 moveFirstTime(db,pos);
213 tassert( pos<db->curpage );
217 findAndMovePN(TBTree *db, TBTMemPage *page) {
220 if ( db->curpage < 2 )
223 for(pos=0;pos<db->curpage;pos++)
224 if ( page==db->Cache[pos] )
226 tassert( pos<db->curpage );
229 if ( pos+1 != db->curpage && db->Cache[pos+1]->pagenumber < page->pagenumber ) {
230 db->Cache[pos] = db->Cache[pos+1];
231 db->Cache[pos+1] = page;
233 } else if ( pos!=0 && db->Cache[pos-1]->pagenumber > page->pagenumber ) {
234 db->Cache[pos] = db->Cache[pos-1];
235 db->Cache[pos-1] = page;
243 cmpNPage(const void *a, const void *b) {
244 if ( (*(TBTMemPage**)a)->pagenumber == (*(TBTMemPage**)b)->pagenumber )
246 return ( (*(TBTMemPage**)a)->pagenumber > (*(TBTMemPage**)b)->pagenumber ) ? 1 : -1;
250 TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) {
251 TBTMemPage *ptr, key, **res;
256 key.pagenumber = pagenumber;
258 res = (TBTMemPage**)bsearch(&ptr, db->Cache, db->curpage, sizeof(TBTMemPage*), cmpNPage);
262 findAndMoveFirstTime(db, *page);
264 off_t offset = (off_t)pagenumber * (off_t)TBTREEPAGESIZE;
266 if( (*page = PageAlloc(db,&pos))==NULL )
269 if ( lseek(db->fd, offset, SEEK_SET) != offset ) {
270 tlog(TL_CRIT,"TBTReadPage: lseek failed: %s",strerror(errno));
274 (*page)->pagenumber=pagenumber;
275 if ( (rc=read(db->fd, &( (*page)->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) {
276 tlog(TL_CRIT,"TBTReadPage: read failed: %s %d %d",strerror(errno), pagenumber, offset);
279 if ( (*page)->iscached ) {
280 moveFirstTime(db, pos);
281 findAndMovePN(db, *page);
286 gettimeofday( &( (*page)->last ), NULL );
292 TBTWritePage(TBTree *db, TBTMemPage *page) {
293 off_t offset = (off_t)(page->pagenumber) * (off_t)TBTREEPAGESIZE;
296 if ( page->issynced )
299 if ( lseek(db->fd, offset, SEEK_SET) != offset ) {
300 tlog(TL_CRIT,"TBTWritePage: lseek failed: %s",strerror(errno));
304 if ( (size=write(db->fd, &( page->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) {
305 tlog(TL_CRIT,"TBTWritePage: write failed: %s (writed only %d bytes)",strerror(errno), size);
316 TBTNewPage(TBTree *db, TBTMemPage **page) {
318 if ( db->lastpagenumber==0 ) {
320 if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) {
321 tlog(TL_CRIT,"TBTNewPage: lseek failed: %s",strerror(errno));
325 if ( offset % TBTREEPAGESIZE != 0 ) {
326 tlog(TL_CRIT,"TBTNewPage: file corrupted");
330 db->lastpagenumber = offset/TBTREEPAGESIZE;
333 if( (*page = PageAlloc(db,&pos))==NULL )
335 (*page)->pagenumber = db->lastpagenumber;
339 db->lastpagenumber++;
341 memset( &((*page)->page), 0, sizeof(TBTPAGEHDRSZ));
342 (*page)->page.freespace = TBTREEPAGESIZE-TBTPAGEHDRSZ;
344 gettimeofday( &( (*page)->last ), NULL );
345 if ( (*page)->iscached ) {
346 moveFirstTime(db, pos);
347 findAndMovePN(db, *page);
357 printValue(u_int32_t len, char *ptr) {
367 dumpPage(TBTree *db, TBTPage *page, int follow) {
370 for(j=0;j<follow;j++)
372 printf("Page: f:%d l:%d r:%d n:%d\n",
378 for(i=0;i<page->npointer;i++) {
379 TBTPointer *ptr= (TBTPointer*)(page->data+TBTPOINTERSIZE(db)*i);
380 for(j=0;j<follow+2;j++)
382 printf("i:%d(%d) kl:%d ko:%d ",
384 (db->keylen) ? db->keylen : ptr->key.varlen.length,
385 (db->keylen) ? 0 : ptr->key.varlen.offset
388 printValue(ptr->key.varlen.length, page->data + ptr->key.varlen.offset);
390 printf("'%d'", *(int*)(ptr->key.fixed.key));
391 if ( page->isleaf ) {
392 printf(" vl:%d vo:%d ",
393 ptr->pointer.leaf.length,
394 ptr->pointer.leaf.offset
396 printValue(ptr->pointer.leaf.length, page->data + ptr->pointer.leaf.offset );
399 printf(" page:%d\n", ptr->pointer.internal.link);
401 dumpTree(db, ptr->pointer.internal.link, follow+4);
407 dumpTree(TBTree *db, u_int32_t pagenumber, int follow) {
410 if ( TBTReadPage(db, pagenumber, &page) ) {
411 fprintf(stderr,"BEDA %d\n", pagenumber);
416 dumpPage(db, &(page->page), follow);
418 if ( !page->iscached )
423 findInPage(TBTree *db, TBTPage *page, TBTValue *key, TBTPointer **res) {
424 TBTPointer *StopLow = (TBTPointer*)(page->data);
425 TBTPointer *StopHigh = (TBTPointer*)(page->data + TBTPOINTERSIZE(db)*page->npointer);
426 TBTPointer *StopMiddle;
430 /* Loop invariant: StopLow <= StopMiddle < StopHigh */
431 while (StopLow < StopHigh) {
432 StopMiddle = (TBTPointer*)(((char*)StopLow) + TBTPOINTERSIZE(db)*((((char*)StopHigh - (char*)StopLow)/TBTPOINTERSIZE(db)) >> 1));
433 if ( db->keylen == 0 ) {
434 A.length = StopMiddle->key.varlen.length;
435 A.value = page->data + StopMiddle->key.varlen.offset;
437 A.length = db->keylen;
438 A.value = StopMiddle->key.fixed.key;
441 if ( ISINFPOINTER(db, page, StopMiddle) )
444 diff = db->cmpkey( &A, key );
448 } else if ( diff < 0 )
449 StopLow = (TBTPointer*)((char*)StopMiddle + TBTPOINTERSIZE(db));
451 StopHigh = StopMiddle;
459 findPage(TBTree *db, TBTValue *key, TBTMemPage **page) {
460 u_int32_t pagenumber = TBTPAGEROOT;
467 if ( (rc=TBTReadPage(db, pagenumber, page))!=TBT_OK )
470 if ( (*page)->page.isleaf )
473 findInPage(db, &((*page)->page), key, &ptr );
474 pagenumber = ptr->pointer.internal.link;
475 if ( !(*page)->iscached )
483 TBTFind(TBTree *db, TBTValue *key, TBTValue *value) {
488 memset(value, 0, sizeof(TBTValue));
490 if ( (rc=findPage(db, key, &page))!=TBT_OK )
496 if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) {
497 if ( !(page)->iscached )
502 value->length = ptr->pointer.leaf.length;
503 value->value = tmalloc(value->length);
504 memcpy( value->value, page->page.data + ptr->pointer.leaf.offset, value->length );
506 if ( !(page)->iscached )
513 packLeafKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, TBTValue *value) {
515 if ( (char*)ptr - page->data != page->npointer * TBTPOINTERSIZE(db) )
516 memmove((char*)ptr+TBTPOINTERSIZE(db), ptr, page->npointer * TBTPOINTERSIZE(db) - ( (char*)ptr - page->data ));
518 ptr = (TBTPointer*)(page->data + page->npointer * TBTPOINTERSIZE(db));
521 page->freespace -= TBTPOINTERSIZE(db) + PTRALIGN(value->length);
523 memcpy( ptr->key.fixed.key, key->value, db->keylen );
524 ptr->pointer.leaf.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
526 page->freespace -= PTRALIGN(key->length);
527 ptr->key.varlen.length = key->length;
528 ptr->key.varlen.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
529 memcpy( page->data + ptr->key.varlen.offset, key->value, key->length);
530 ptr->pointer.leaf.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace + PTRALIGN(key->length);
532 ptr->pointer.leaf.length = value->length;
533 memcpy( page->data + ptr->pointer.leaf.offset, value->value, value->length );
537 packInternalKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, u_int32_t pagenumber) {
539 if ( (char*)ptr - page->data != page->npointer * TBTPOINTERSIZE(db) )
540 memmove((char*)ptr+TBTPOINTERSIZE(db), ptr, page->npointer * TBTPOINTERSIZE(db) - ( (char*)ptr - page->data ));
542 ptr = (TBTPointer*)(page->data + page->npointer * TBTPOINTERSIZE(db));
545 page->freespace -= TBTPOINTERSIZE(db);
549 memcpy( ptr->key.fixed.key, key->value, db->keylen );
551 memset( ptr->key.fixed.key, 0, db->keylen );
553 page->freespace -= PTRALIGN(key->length);
554 ptr->key.varlen.length = key->length;
555 ptr->key.varlen.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
557 memcpy( page->data + ptr->key.varlen.offset, key->value, key->length);
559 ptr->pointer.internal.link = pagenumber;
564 deleteKV(TBTree *db, TBTPage *page, TBTPointer **ptrs, u_int32_t num) {
565 static char buf[TBTREEPAGESIZE];
567 TBTPointer *ptr = (TBTPointer*)page->data, *tmp, **ptrptr = ptrs;
569 /* suppose ptrs are ordered! */
571 while( ptrptr - ptrs < num && (char*)ptr - page->data < TBTPOINTERSIZE(db)*page->npointer ) {
572 if ( *ptrptr < ptr ) {
573 tlog(TL_CRIT,"deleteKV: Something wrong: unexistent *ptrs");
575 } else if ( *ptrptr > ptr ) {
577 memcpy(tmp, ptr, TBTPOINTERSIZE(db));
578 ptr = (TBTPointer*)( (char*)ptr + TBTPOINTERSIZE(db) );
579 tmp = (TBTPointer*)( (char*)tmp + TBTPOINTERSIZE(db) );
581 page->freespace += TBTPOINTERSIZE(db) +
582 ( ( page->isleaf ) ? PTRALIGN(ptr->pointer.leaf.length) : 0 ) +
583 ( ( db->keylen ) ? 0 : PTRALIGN(ptr->key.varlen.length) );
585 ptr = (TBTPointer*)( (char*)ptr + TBTPOINTERSIZE(db) );
589 if ( (char*)ptr - page->data < TBTPOINTERSIZE(db)*page->npointer && tmp!=ptr )
590 memmove( tmp, ptr, page->npointer * TBTPOINTERSIZE(db) - ((char*)ptr - page->data) );
594 tmp = (TBTPointer*)page->data;
595 start = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
596 for(i=0;i<page->npointer;i++) {
597 if ( page->isleaf ) {
598 memcpy( bufptr, page->data + tmp->pointer.leaf.offset, tmp->pointer.leaf.length );
599 tmp->pointer.leaf.offset = start + (bufptr-buf);
600 bufptr+=PTRALIGN(tmp->pointer.leaf.length);
603 if ( db->keylen == 0 ) {
604 if ( tmp->key.varlen.length )
605 memcpy( bufptr, page->data + tmp->key.varlen.offset, tmp->key.varlen.length );
606 tmp->key.varlen.offset = start + (bufptr-buf);
607 bufptr+=PTRALIGN(tmp->key.varlen.length);
609 tmp = (TBTPointer*)( (char*)tmp + TBTPOINTERSIZE(db) );
612 memcpy( page->data + start, buf, bufptr-buf );
616 findDelimiter(TBTree *db, TBTPage *page, TBTPointer *ptr, int size, u_int32_t start, int *where) {
617 int *sizes = (int*) tmalloc( sizeof(int) * (page->npointer+1) );
620 int lfree=TBTREEPAGESIZE-TBTPAGEHDRSZ, rfree=TBTREEPAGESIZE-TBTPAGEHDRSZ;
621 int nptr = ( (char*)ptr - page->data ) / TBTPOINTERSIZE(db);
624 for(i=0; i<page->npointer;i++) {
625 tomove = (TBTPointer*)(page->data + i*TBTPOINTERSIZE(db));
626 if ( was==0 && i==nptr ) {
630 sizes[i+was] = TBTPOINTERSIZE(db) +
631 ( (page->isleaf) ? PTRALIGN(tomove->pointer.leaf.length) : 0 ) +
632 ( ( db->keylen ) ? 0 : PTRALIGN(tomove->key.varlen.length) );
637 sizes[page->npointer]=size;
639 for(i=0;i<page->npointer+1;i++) {
651 } else if ( rfree < 0 ) {
659 *where = (nptr < start) ? -1 : 1;
668 populatePages(TBTree *db, TBTPage *srcpage, TBTPage* newpage, u_int32_t start) {
670 TBTPointer *tomove, **todelete = tmalloc( sizeof(TBTPointer*)* srcpage->npointer );
671 u_int32_t numtodelete=0;
674 for(i=start;i<srcpage->npointer;i++) {
675 tomove = (TBTPointer*)(srcpage->data + i*TBTPOINTERSIZE(db));
677 todelete[numtodelete] = tomove;
681 k.value = tomove->key.fixed.key;
683 k.length = tomove->key.varlen.length;
684 k.value = srcpage->data + tomove->key.varlen.offset;
687 if ( srcpage->isleaf ) {
688 v.length = tomove->pointer.leaf.length;
689 v.value = srcpage->data + tomove->pointer.leaf.offset;
690 packLeafKV(db, newpage, NULL, &k, &v);
692 packInternalKV(db, newpage, NULL, &k, tomove->pointer.internal.link);
695 deleteKV(db, srcpage, todelete, numtodelete);
700 splitPage(TBTree *db, TBTMemPage *srcpage, TBTMemPage** newpage, TBTPointer **ptr, int size, TBTValue *key, TBTValue *value, u_int32_t pagenumber) {
702 int start=0, where=0;
703 int nptr = ( (char*)(*ptr) - srcpage->page.data ) / TBTPOINTERSIZE(db);
705 if ( TBTNewPage(db, newpage) )
709 srcpage->issynced=tmp->issynced=0;
711 tmp->page.rightlink = srcpage->page.rightlink;
712 srcpage->page.rightlink = tmp->pagenumber;
713 tmp->page.isleaf = srcpage->page.isleaf;
715 switch(db->strategy) {
716 case TBT_STATEGY_HALF:
717 start = findDelimiter(db, &(srcpage->page), *ptr, size, (srcpage->page.npointer+1)/2, &where);
719 case TBT_STATEGY_LEFTFILL:
720 start = findDelimiter(db, &(srcpage->page), *ptr, size, srcpage->page.npointer, &where);
722 case TBT_STATEGY_RIGHTFILL:
723 start = findDelimiter(db, &(srcpage->page), *ptr, size, 0, &where);
726 tlog(TL_CRIT,"splitPage: unknown strategy: %d", db->strategy);
730 populatePages(db, &(srcpage->page), &(tmp->page), start);
735 *ptr = (TBTPointer*)(tmp->page.data + nptr*TBTPOINTERSIZE(db));
737 if ( tmp->page.isleaf )
738 packLeafKV(db, &(tmp->page), *ptr, key, value);
740 packInternalKV(db, &(tmp->page), *ptr, key, pagenumber);
746 savePage(TBTree *db, TBTMemPage *page) {
750 if ( !page->iscached ) {
751 if ( page->issynced==0 && (rc=TBTWritePage(db, page)) != TBT_OK )
759 layerInsert(TBTree *db, u_int32_t pagenumber, TBTMemPage **left, TBTMemPage **right, TBTValue *key, TBTValue *value) {
761 TBTPointer *ptr=NULL;
762 TBTMemPage *page=NULL, *newright=NULL;
764 if ( (rc=TBTReadPage(db, pagenumber, &page))!=TBT_OK )
769 fnd = findInPage(db, &(page->page), key, &ptr );
772 if ( page->page.isleaf ) {
773 u_int32_t size = TBTPOINTERSIZE(db) +
774 PTRALIGN(value->length) +
775 ( ( db->keylen ) ? 0 : PTRALIGN(key->length) );
777 if ( size>(TBTREEPAGESIZE-TBTPAGEHDRSZ)/2 ) {
778 tlog(TL_ALARM,"Huge size: %d bytes", size);
783 deleteKV(db, &(page->page), &ptr, 1);
787 if ( size <= page->page.freespace ) {
788 packLeafKV(db, &(page->page), ptr, key, value);
791 rc = splitPage(db, page, &newright, &ptr, size, key, value, 0);
792 if ( rc != TBT_OK ) return rc;
794 page->issynced = newright->issynced = 0;
795 page->islocked = newright->islocked = 1;
798 if ( (rc=layerInsert(db, ptr->pointer.internal.link, left, right, key, value))!=TBT_OK ) {
803 if ( *left ) { /* child page was splited */
805 TBTPointer *rightptr = (TBTPointer*)( (*left)->page.data + ((*left)->page.npointer-1) * TBTPOINTERSIZE(db) );
808 if ( ISINFPOINTER(db, &(page->page), ptr) ) {
809 deleteKV(db, &(page->page), &ptr, 1);
810 K.length = 0; K.value = NULL;
811 packInternalKV(db, &(page->page), ptr, &K, (*right)->pagenumber);
813 ptr->pointer.internal.link=(*right)->pagenumber;
815 K.length = ( db->keylen ) ? db->keylen : rightptr->key.varlen.length;
816 K.value = ( db->keylen ) ? rightptr->key.fixed.key : (*left)->page.data + rightptr->key.varlen.offset;
818 size = TBTPOINTERSIZE(db) + ( ( db->keylen ) ? 0 : PTRALIGN(K.length) );
820 if ( size <= page->page.freespace ) {
821 packInternalKV(db, &(page->page), ptr, &K, (*left)->pagenumber);
824 rc = splitPage(db, page, &newright, &ptr, size, &K, NULL, (*left)->pagenumber);
825 if ( rc != TBT_OK ) return rc;
827 page->issynced = newright->issynced = 0;
828 page->islocked = newright->islocked = 1;
834 if ( (rc=savePage(db, *left))!=TBT_OK )
836 if ( (rc=savePage(db, *right))!=TBT_OK )
842 if ( page->pagenumber==TBTPAGEROOT ) { /* root was splited */
847 if ( (rc=TBTNewPage(db,&newleft))!=TBT_OK )
849 memcpy( &(newleft->page) , &(page->page), TBTREEPAGESIZE);
850 memset( &(page->page), 0, TBTREEPAGESIZE);
851 page->page.freespace=TBTREEPAGESIZE-TBTPAGEHDRSZ;
853 ptr = (TBTPointer*)( newleft->page.data + (newleft->page.npointer-1) * TBTPOINTERSIZE(db) );
854 K.length = ( db->keylen ) ? db->keylen : ptr->key.varlen.length;
855 K.value = ( db->keylen ) ? ptr->key.fixed.key : newleft->page.data + ptr->key.varlen.offset;
856 packInternalKV(db, &(page->page), NULL, &K, newleft->pagenumber);
858 ptr = (TBTPointer*)( newright->page.data + (newright->page.npointer-1) * TBTPOINTERSIZE(db) );
861 packInternalKV(db, &(page->page), NULL, &K, newright->pagenumber);
863 if ( (rc=savePage(db, newright))!=TBT_OK )
865 if ( (rc=savePage(db, newleft))!=TBT_OK )
867 if ( (rc=savePage(db, page))!=TBT_OK )
873 } else if ( (rc=savePage(db, page))!=TBT_OK )
881 TBTInsert(TBTree *db, TBTValue *key, TBTValue *value) {
883 TBTMemPage *lpage=NULL, *rpage=NULL;
885 rc = layerInsert(db, TBTPAGEROOT, &lpage, &rpage, key, value);
887 tassert( lpage==NULL && rpage==NULL );
893 TBTDelete(TBTree *db, TBTValue *key) {
898 if ( (rc=findPage(db, key, &page))!=TBT_OK )
904 if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) {
905 if ( !(page)->iscached )
910 deleteKV(db, &(page->page), &ptr, 1);
913 rc=savePage(db, page);
919 TBTInitIterator(TBTree *db, TBTIterator *iterator) {
920 TBTMemPage *page=NULL;
923 u_int32_t pagenum=TBTPAGEROOT;
925 memset(iterator, 0, sizeof(TBTIterator));
928 if ( page && !page->iscached )
930 if ( (rc=TBTReadPage(db, pagenum, &page))!=TBT_OK )
933 ptr = (TBTPointer*)( page->page.data );
934 pagenum = ptr->pointer.internal.link;
935 } while( !page->page.isleaf );
938 iterator->page = page;
945 TBTFreeIterator(TBTree *db, TBTIterator *iterator) {
947 if ( iterator->page ) {
948 iterator->page->islocked=0;
949 if ( !iterator->page->iscached )
950 tfree(iterator->page);
953 memset(iterator, 0, sizeof(TBTIterator));
957 TBTIterate(TBTree *db, TBTIterator *iterator, TBTValue *key, TBTValue *value ) {
960 if ( iterator->page==NULL ) {
961 memset(key, 0, sizeof(TBTValue));
962 memset(value, 0, sizeof(TBTValue));
966 if ( (char*)(iterator->ptr) - iterator->page->page.data >= TBTPOINTERSIZE(db) * iterator->page->page.npointer ) {
968 u_int32_t pagenumber = iterator->page->page.rightlink;
969 iterator->page->islocked=0;
970 if ( !iterator->page->iscached )
971 tfree(iterator->page);
974 if ( pagenumber==TBTPAGEROOT ) {
975 memset(key, 0, sizeof(TBTValue));
976 memset(value, 0, sizeof(TBTValue));
979 if ( (rc=TBTReadPage(db, pagenumber, &(iterator->page)))!=TBT_OK )
981 } while ( iterator->page->page.npointer == 0 );
983 iterator->page->islocked=1;
984 iterator->ptr = (TBTPointer*)( iterator->page->page.data );
988 key->length = db->keylen;
989 key->value = iterator->ptr->key.fixed.key;
991 key->length = iterator->ptr->key.varlen.length;
992 key->value = iterator->page->page.data + iterator->ptr->key.varlen.offset;
995 value->length = iterator->ptr->pointer.leaf.length;
996 value->value = iterator->page->page.data + iterator->ptr->pointer.leaf.offset;
998 iterator->ptr = ( TBTPointer* ) ( (char*)(iterator->ptr) + TBTPOINTERSIZE(db) );