2 * Copyright (c) 2005 Teodor Sigaev <teodor@sigaev.ru>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the author nor the names of any co-contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
18 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
25 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
27 * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 #include <sys/types.h>
45 static int TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) ;
46 static int TBTWritePage(TBTree *db, TBTMemPage *page) ;
47 static int TBTNewPage(TBTree *db, TBTMemPage **page) ;
50 TBTOpen(TBTree *db, char *file) {
54 db->fd = open(file, O_RDONLY);
56 tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno));
59 if ( flock( db->fd, LOCK_SH ) < 0 ) {
60 tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno));
65 db->fd = open(file, O_CREAT | O_RDWR, 0666);
67 tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno));
70 if ( flock( db->fd, LOCK_EX ) < 0 ) {
71 tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno));
78 db->Cache=(TBTMemPage**)t0malloc( sizeof(TBTMemPage*)*db->npage );
79 db->TimeCache=(TBTMemPage**)t0malloc( sizeof(TBTMemPage*)*db->npage );
83 db->cachehits = db->pageread = db->pagewrite = 0;
85 if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) {
86 tlog(TL_CRIT,"TBTOpen: lseek failed: %s",strerror(errno));
87 flock( db->fd, LOCK_UN );
94 tlog(TL_CRIT,"TBTOpen: file is void");
95 flock( db->fd, LOCK_UN );
100 if ( TBTNewPage(db, &page) != TBT_OK ) {
101 flock( db->fd, LOCK_UN );
106 if ( TBTWritePage(db, page)!=TBT_OK ){
107 flock( db->fd, LOCK_UN );
114 db->lastpagenumber=0;
119 TBTClose(TBTree *db) {
128 for(i=0; i<db->curpage; i++)
129 tfree( db->Cache[i] );
132 tfree( db->TimeCache );
135 flock( db->fd, LOCK_UN );
138 return (rc) ? TBT_ERROR : TBT_OK;
142 TBTSync(TBTree *db) {
146 if ( db->readonly || db->npage==0 )
149 for(i=0; i<db->curpage; i++)
150 if ( !db->Cache[i]->issynced )
151 rc |= TBTWritePage(db, db->Cache[i]);
153 if ( fsync(db->fd) ) {
154 tlog(TL_CRIT,"TBTSync: fsync failed: %s",strerror(errno));
158 return (rc) ? TBT_ERROR : TBT_OK;
162 PageAlloc(TBTree *db, int *pos) {
165 if ( db->curpage < db->npage ) {
166 page = db->Cache[db->curpage] = db->TimeCache[db->curpage] = (TBTMemPage*)t0malloc(sizeof(TBTMemPage));
170 } else if ( db->npage == 0 ) {
171 page = (TBTMemPage*)t0malloc(sizeof(TBTMemPage));
174 for( i=0; i<db->curpage && db->TimeCache[i]->islocked; i++ );
175 if ( i == db->curpage ) { /*all pages locked*/
176 /*tlog(TL_WARN,"TBTReadPage: all pages in cache locked, increase number of cached pages");*/
177 page = (TBTMemPage*)t0malloc(sizeof(TBTMemPage));
179 page = db->TimeCache[i];
180 if ( !page->issynced )
181 if ( TBTWritePage(db, page) )
183 memset(page,0,sizeof(TBTMemPage));
193 moveFirstTime(TBTree *db, int pos) {
198 if (pos+1==db->curpage )
201 ptr = db->TimeCache[pos];
202 memmove( db->TimeCache + pos , db->TimeCache + pos + 1, sizeof(TBTMemPage*)*(db->curpage-pos-1) );
203 db->TimeCache[db->curpage-1] = ptr;
207 findAndMoveFirstTime(TBTree *db, TBTMemPage *page) {
209 for(pos=0;pos<db->curpage;pos++)
210 if ( page==db->TimeCache[pos] ) {
211 moveFirstTime(db,pos);
214 tassert( pos<db->curpage );
218 findAndMovePN(TBTree *db, TBTMemPage *page) {
221 if ( db->curpage < 2 )
224 for(pos=0;pos<db->curpage;pos++)
225 if ( page==db->Cache[pos] )
227 tassert( pos<db->curpage );
230 if ( pos+1 != db->curpage && db->Cache[pos+1]->pagenumber < page->pagenumber ) {
231 db->Cache[pos] = db->Cache[pos+1];
232 db->Cache[pos+1] = page;
234 } else if ( pos!=0 && db->Cache[pos-1]->pagenumber > page->pagenumber ) {
235 db->Cache[pos] = db->Cache[pos-1];
236 db->Cache[pos-1] = page;
244 cmpNPage(const void *a, const void *b) {
245 if ( (*(TBTMemPage**)a)->pagenumber == (*(TBTMemPage**)b)->pagenumber )
247 return ( (*(TBTMemPage**)a)->pagenumber > (*(TBTMemPage**)b)->pagenumber ) ? 1 : -1;
251 TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) {
252 TBTMemPage *ptr, key, **res;
257 key.pagenumber = pagenumber;
259 res = (TBTMemPage**)bsearch(&ptr, db->Cache, db->curpage, sizeof(TBTMemPage*), cmpNPage);
263 findAndMoveFirstTime(db, *page);
265 off_t offset = (off_t)pagenumber * (off_t)TBTREEPAGESIZE;
267 if( (*page = PageAlloc(db,&pos))==NULL )
270 if ( lseek(db->fd, offset, SEEK_SET) != offset ) {
271 tlog(TL_CRIT,"TBTReadPage: lseek failed: %s",strerror(errno));
275 (*page)->pagenumber=pagenumber;
276 if ( (rc=read(db->fd, &( (*page)->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) {
277 tlog(TL_CRIT,"TBTReadPage: read failed: %s %d %d",strerror(errno), pagenumber, offset);
280 if ( (*page)->iscached ) {
281 moveFirstTime(db, pos);
282 findAndMovePN(db, *page);
287 gettimeofday( &( (*page)->last ), NULL );
293 TBTWritePage(TBTree *db, TBTMemPage *page) {
294 off_t offset = (off_t)(page->pagenumber) * (off_t)TBTREEPAGESIZE;
297 if ( page->issynced )
300 if ( lseek(db->fd, offset, SEEK_SET) != offset ) {
301 tlog(TL_CRIT,"TBTWritePage: lseek failed: %s",strerror(errno));
305 if ( (size=write(db->fd, &( page->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) {
306 tlog(TL_CRIT,"TBTWritePage: write failed: %s (writed only %d bytes)",strerror(errno), size);
317 TBTNewPage(TBTree *db, TBTMemPage **page) {
319 if ( db->lastpagenumber==0 ) {
321 if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) {
322 tlog(TL_CRIT,"TBTNewPage: lseek failed: %s",strerror(errno));
326 if ( offset % TBTREEPAGESIZE != 0 ) {
327 tlog(TL_CRIT,"TBTNewPage: file corrupted");
331 db->lastpagenumber = offset/TBTREEPAGESIZE;
334 if( (*page = PageAlloc(db,&pos))==NULL )
336 (*page)->pagenumber = db->lastpagenumber;
340 db->lastpagenumber++;
342 memset( &((*page)->page), 0, sizeof(TBTPAGEHDRSZ));
343 (*page)->page.freespace = TBTREEPAGESIZE-TBTPAGEHDRSZ;
345 gettimeofday( &( (*page)->last ), NULL );
346 if ( (*page)->iscached ) {
347 moveFirstTime(db, pos);
348 findAndMovePN(db, *page);
358 printValue(u_int32_t len, char *ptr) {
368 dumpPage(TBTree *db, TBTPage *page, int follow) {
371 for(j=0;j<follow;j++)
373 printf("Page: f:%d l:%d r:%d n:%d\n",
379 for(i=0;i<page->npointer;i++) {
380 TBTPointer *ptr= (TBTPointer*)(page->data+TBTPOINTERSIZE(db)*i);
381 for(j=0;j<follow+2;j++)
383 printf("i:%d(%d) kl:%d ko:%d ",
385 (db->keylen) ? db->keylen : ptr->key.varlen.length,
386 (db->keylen) ? 0 : ptr->key.varlen.offset
389 printValue(ptr->key.varlen.length, page->data + ptr->key.varlen.offset);
391 printf("'%d'", *(int*)(ptr->key.fixed.key));
392 if ( page->isleaf ) {
393 printf(" vl:%d vo:%d ",
394 ptr->pointer.leaf.length,
395 ptr->pointer.leaf.offset
397 printValue(ptr->pointer.leaf.length, page->data + ptr->pointer.leaf.offset );
400 printf(" page:%d\n", ptr->pointer.internal.link);
402 dumpTree(db, ptr->pointer.internal.link, follow+4);
408 dumpTree(TBTree *db, u_int32_t pagenumber, int follow) {
411 if ( TBTReadPage(db, pagenumber, &page) ) {
412 fprintf(stderr,"BEDA %d\n", pagenumber);
417 dumpPage(db, &(page->page), follow);
419 if ( !page->iscached )
424 findInPage(TBTree *db, TBTPage *page, TBTValue *key, TBTPointer **res) {
425 TBTPointer *StopLow = (TBTPointer*)(page->data);
426 TBTPointer *StopHigh = (TBTPointer*)(page->data + TBTPOINTERSIZE(db)*page->npointer);
427 TBTPointer *StopMiddle;
431 /* Loop invariant: StopLow <= StopMiddle < StopHigh */
432 while (StopLow < StopHigh) {
433 StopMiddle = (TBTPointer*)(((char*)StopLow) + TBTPOINTERSIZE(db)*((((char*)StopHigh - (char*)StopLow)/TBTPOINTERSIZE(db)) >> 1));
434 if ( db->keylen == 0 ) {
435 A.length = StopMiddle->key.varlen.length;
436 A.value = page->data + StopMiddle->key.varlen.offset;
438 A.length = db->keylen;
439 A.value = StopMiddle->key.fixed.key;
442 if ( ISINFPOINTER(db, page, StopMiddle) )
445 diff = db->cmpkey( &A, key );
449 } else if ( diff < 0 )
450 StopLow = (TBTPointer*)((char*)StopMiddle + TBTPOINTERSIZE(db));
452 StopHigh = StopMiddle;
460 findPage(TBTree *db, TBTValue *key, TBTMemPage **page) {
461 u_int32_t pagenumber = TBTPAGEROOT;
468 if ( (rc=TBTReadPage(db, pagenumber, page))!=TBT_OK )
471 if ( (*page)->page.isleaf )
474 findInPage(db, &((*page)->page), key, &ptr );
475 pagenumber = ptr->pointer.internal.link;
476 if ( !(*page)->iscached )
484 TBTFind(TBTree *db, TBTValue *key, TBTValue *value) {
489 memset(value, 0, sizeof(TBTValue));
491 if ( (rc=findPage(db, key, &page))!=TBT_OK )
497 if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) {
498 if ( !(page)->iscached )
503 value->length = ptr->pointer.leaf.length;
504 value->value = tmalloc(value->length);
505 memcpy( value->value, page->page.data + ptr->pointer.leaf.offset, value->length );
507 if ( !(page)->iscached )
514 packLeafKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, TBTValue *value) {
516 if ( (char*)ptr - page->data != page->npointer * TBTPOINTERSIZE(db) )
517 memmove((char*)ptr+TBTPOINTERSIZE(db), ptr, page->npointer * TBTPOINTERSIZE(db) - ( (char*)ptr - page->data ));
519 ptr = (TBTPointer*)(page->data + page->npointer * TBTPOINTERSIZE(db));
522 page->freespace -= TBTPOINTERSIZE(db) + PTRALIGN(value->length);
524 memcpy( ptr->key.fixed.key, key->value, db->keylen );
525 ptr->pointer.leaf.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
527 page->freespace -= PTRALIGN(key->length);
528 ptr->key.varlen.length = key->length;
529 ptr->key.varlen.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
530 memcpy( page->data + ptr->key.varlen.offset, key->value, key->length);
531 ptr->pointer.leaf.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace + PTRALIGN(key->length);
533 ptr->pointer.leaf.length = value->length;
534 memcpy( page->data + ptr->pointer.leaf.offset, value->value, value->length );
538 packInternalKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, u_int32_t pagenumber) {
540 if ( (char*)ptr - page->data != page->npointer * TBTPOINTERSIZE(db) )
541 memmove((char*)ptr+TBTPOINTERSIZE(db), ptr, page->npointer * TBTPOINTERSIZE(db) - ( (char*)ptr - page->data ));
543 ptr = (TBTPointer*)(page->data + page->npointer * TBTPOINTERSIZE(db));
546 page->freespace -= TBTPOINTERSIZE(db);
550 memcpy( ptr->key.fixed.key, key->value, db->keylen );
552 memset( ptr->key.fixed.key, 0, db->keylen );
554 page->freespace -= PTRALIGN(key->length);
555 ptr->key.varlen.length = key->length;
556 ptr->key.varlen.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
558 memcpy( page->data + ptr->key.varlen.offset, key->value, key->length);
560 ptr->pointer.internal.link = pagenumber;
565 deleteKV(TBTree *db, TBTPage *page, TBTPointer **ptrs, u_int32_t num) {
566 static char buf[TBTREEPAGESIZE];
568 TBTPointer *ptr = (TBTPointer*)page->data, *tmp, **ptrptr = ptrs;
570 /* suppose ptrs are ordered! */
572 while( ptrptr - ptrs < num && (char*)ptr - page->data < TBTPOINTERSIZE(db)*page->npointer ) {
573 if ( *ptrptr < ptr ) {
574 tlog(TL_CRIT,"deleteKV: Something wrong: unexistent *ptrs");
576 } else if ( *ptrptr > ptr ) {
578 memcpy(tmp, ptr, TBTPOINTERSIZE(db));
579 ptr = (TBTPointer*)( (char*)ptr + TBTPOINTERSIZE(db) );
580 tmp = (TBTPointer*)( (char*)tmp + TBTPOINTERSIZE(db) );
582 page->freespace += TBTPOINTERSIZE(db) +
583 ( ( page->isleaf ) ? PTRALIGN(ptr->pointer.leaf.length) : 0 ) +
584 ( ( db->keylen ) ? 0 : PTRALIGN(ptr->key.varlen.length) );
586 ptr = (TBTPointer*)( (char*)ptr + TBTPOINTERSIZE(db) );
590 if ( (char*)ptr - page->data < TBTPOINTERSIZE(db)*page->npointer && tmp!=ptr )
591 memmove( tmp, ptr, page->npointer * TBTPOINTERSIZE(db) - ((char*)ptr - page->data) );
595 tmp = (TBTPointer*)page->data;
596 start = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
597 for(i=0;i<page->npointer;i++) {
598 if ( page->isleaf ) {
599 memcpy( bufptr, page->data + tmp->pointer.leaf.offset, tmp->pointer.leaf.length );
600 tmp->pointer.leaf.offset = start + (bufptr-buf);
601 bufptr+=PTRALIGN(tmp->pointer.leaf.length);
604 if ( db->keylen == 0 ) {
605 if ( tmp->key.varlen.length )
606 memcpy( bufptr, page->data + tmp->key.varlen.offset, tmp->key.varlen.length );
607 tmp->key.varlen.offset = start + (bufptr-buf);
608 bufptr+=PTRALIGN(tmp->key.varlen.length);
610 tmp = (TBTPointer*)( (char*)tmp + TBTPOINTERSIZE(db) );
613 memcpy( page->data + start, buf, bufptr-buf );
617 findDelimiter(TBTree *db, TBTPage *page, TBTPointer *ptr, int size, u_int32_t start, int *where) {
618 int *sizes = (int*) tmalloc( sizeof(int) * (page->npointer+1) );
621 int lfree=TBTREEPAGESIZE-TBTPAGEHDRSZ, rfree=TBTREEPAGESIZE-TBTPAGEHDRSZ;
622 int nptr = ( (char*)ptr - page->data ) / TBTPOINTERSIZE(db);
625 for(i=0; i<page->npointer;i++) {
626 tomove = (TBTPointer*)(page->data + i*TBTPOINTERSIZE(db));
627 if ( was==0 && i==nptr ) {
631 sizes[i+was] = TBTPOINTERSIZE(db) +
632 ( (page->isleaf) ? PTRALIGN(tomove->pointer.leaf.length) : 0 ) +
633 ( ( db->keylen ) ? 0 : PTRALIGN(tomove->key.varlen.length) );
638 sizes[page->npointer]=size;
640 for(i=0;i<page->npointer+1;i++) {
652 } else if ( rfree < 0 ) {
660 *where = (nptr < start) ? -1 : 1;
669 populatePages(TBTree *db, TBTPage *srcpage, TBTPage* newpage, u_int32_t start) {
671 TBTPointer *tomove, **todelete = tmalloc( sizeof(TBTPointer*)* srcpage->npointer );
672 u_int32_t numtodelete=0;
675 for(i=start;i<srcpage->npointer;i++) {
676 tomove = (TBTPointer*)(srcpage->data + i*TBTPOINTERSIZE(db));
678 todelete[numtodelete] = tomove;
682 k.value = tomove->key.fixed.key;
684 k.length = tomove->key.varlen.length;
685 k.value = srcpage->data + tomove->key.varlen.offset;
688 if ( srcpage->isleaf ) {
689 v.length = tomove->pointer.leaf.length;
690 v.value = srcpage->data + tomove->pointer.leaf.offset;
691 packLeafKV(db, newpage, NULL, &k, &v);
693 packInternalKV(db, newpage, NULL, &k, tomove->pointer.internal.link);
696 deleteKV(db, srcpage, todelete, numtodelete);
701 splitPage(TBTree *db, TBTMemPage *srcpage, TBTMemPage** newpage, TBTPointer **ptr, int size, TBTValue *key, TBTValue *value, u_int32_t pagenumber) {
703 int start=0, where=0;
704 int nptr = ( (char*)(*ptr) - srcpage->page.data ) / TBTPOINTERSIZE(db);
706 if ( TBTNewPage(db, newpage) )
710 srcpage->issynced=tmp->issynced=0;
712 tmp->page.rightlink = srcpage->page.rightlink;
713 srcpage->page.rightlink = tmp->pagenumber;
714 tmp->page.isleaf = srcpage->page.isleaf;
716 switch(db->strategy) {
717 case TBT_STATEGY_HALF:
718 start = findDelimiter(db, &(srcpage->page), *ptr, size, (srcpage->page.npointer+1)/2, &where);
720 case TBT_STATEGY_LEFTFILL:
721 start = findDelimiter(db, &(srcpage->page), *ptr, size, srcpage->page.npointer, &where);
723 case TBT_STATEGY_RIGHTFILL:
724 start = findDelimiter(db, &(srcpage->page), *ptr, size, 0, &where);
727 tlog(TL_CRIT,"splitPage: unknown strategy: %d", db->strategy);
731 populatePages(db, &(srcpage->page), &(tmp->page), start);
736 *ptr = (TBTPointer*)(tmp->page.data + nptr*TBTPOINTERSIZE(db));
738 if ( tmp->page.isleaf )
739 packLeafKV(db, &(tmp->page), *ptr, key, value);
741 packInternalKV(db, &(tmp->page), *ptr, key, pagenumber);
747 savePage(TBTree *db, TBTMemPage *page) {
751 if ( !page->iscached ) {
752 if ( page->issynced==0 && (rc=TBTWritePage(db, page)) != TBT_OK )
760 layerInsert(TBTree *db, u_int32_t pagenumber, TBTMemPage **left, TBTMemPage **right, TBTValue *key, TBTValue *value) {
762 TBTPointer *ptr=NULL;
763 TBTMemPage *page=NULL, *newright=NULL;
765 if ( (rc=TBTReadPage(db, pagenumber, &page))!=TBT_OK )
770 fnd = findInPage(db, &(page->page), key, &ptr );
773 if ( page->page.isleaf ) {
774 u_int32_t size = TBTPOINTERSIZE(db) +
775 PTRALIGN(value->length) +
776 ( ( db->keylen ) ? 0 : PTRALIGN(key->length) );
778 if ( size>(TBTREEPAGESIZE-TBTPAGEHDRSZ)/2 ) {
779 tlog(TL_ALARM,"Huge size: %d bytes", size);
784 deleteKV(db, &(page->page), &ptr, 1);
788 if ( size <= page->page.freespace ) {
789 packLeafKV(db, &(page->page), ptr, key, value);
792 rc = splitPage(db, page, &newright, &ptr, size, key, value, 0);
793 if ( rc != TBT_OK ) return rc;
795 page->issynced = newright->issynced = 0;
796 page->islocked = newright->islocked = 1;
799 if ( (rc=layerInsert(db, ptr->pointer.internal.link, left, right, key, value))!=TBT_OK ) {
804 if ( *left ) { /* child page was splited */
806 TBTPointer *rightptr = (TBTPointer*)( (*left)->page.data + ((*left)->page.npointer-1) * TBTPOINTERSIZE(db) );
809 if ( ISINFPOINTER(db, &(page->page), ptr) ) {
810 deleteKV(db, &(page->page), &ptr, 1);
811 K.length = 0; K.value = NULL;
812 packInternalKV(db, &(page->page), ptr, &K, (*right)->pagenumber);
814 ptr->pointer.internal.link=(*right)->pagenumber;
816 K.length = ( db->keylen ) ? db->keylen : rightptr->key.varlen.length;
817 K.value = ( db->keylen ) ? rightptr->key.fixed.key : (*left)->page.data + rightptr->key.varlen.offset;
819 size = TBTPOINTERSIZE(db) + ( ( db->keylen ) ? 0 : PTRALIGN(K.length) );
821 if ( size <= page->page.freespace ) {
822 packInternalKV(db, &(page->page), ptr, &K, (*left)->pagenumber);
825 rc = splitPage(db, page, &newright, &ptr, size, &K, NULL, (*left)->pagenumber);
826 if ( rc != TBT_OK ) return rc;
828 page->issynced = newright->issynced = 0;
829 page->islocked = newright->islocked = 1;
835 if ( (rc=savePage(db, *left))!=TBT_OK )
837 if ( (rc=savePage(db, *right))!=TBT_OK )
843 if ( page->pagenumber==TBTPAGEROOT ) { /* root was splited */
848 if ( (rc=TBTNewPage(db,&newleft))!=TBT_OK )
850 memcpy( &(newleft->page) , &(page->page), TBTREEPAGESIZE);
851 memset( &(page->page), 0, TBTREEPAGESIZE);
852 page->page.freespace=TBTREEPAGESIZE-TBTPAGEHDRSZ;
854 ptr = (TBTPointer*)( newleft->page.data + (newleft->page.npointer-1) * TBTPOINTERSIZE(db) );
855 K.length = ( db->keylen ) ? db->keylen : ptr->key.varlen.length;
856 K.value = ( db->keylen ) ? ptr->key.fixed.key : newleft->page.data + ptr->key.varlen.offset;
857 packInternalKV(db, &(page->page), NULL, &K, newleft->pagenumber);
859 ptr = (TBTPointer*)( newright->page.data + (newright->page.npointer-1) * TBTPOINTERSIZE(db) );
862 packInternalKV(db, &(page->page), NULL, &K, newright->pagenumber);
864 if ( (rc=savePage(db, newright))!=TBT_OK )
866 if ( (rc=savePage(db, newleft))!=TBT_OK )
868 if ( (rc=savePage(db, page))!=TBT_OK )
874 } else if ( (rc=savePage(db, page))!=TBT_OK )
882 TBTInsert(TBTree *db, TBTValue *key, TBTValue *value) {
884 TBTMemPage *lpage=NULL, *rpage=NULL;
886 rc = layerInsert(db, TBTPAGEROOT, &lpage, &rpage, key, value);
888 tassert( lpage==NULL && rpage==NULL );
894 TBTDelete(TBTree *db, TBTValue *key) {
899 if ( (rc=findPage(db, key, &page))!=TBT_OK )
905 if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) {
906 if ( !(page)->iscached )
911 deleteKV(db, &(page->page), &ptr, 1);
914 rc=savePage(db, page);
920 TBTInitIterator(TBTree *db, TBTIterator *iterator) {
921 TBTMemPage *page=NULL;
924 u_int32_t pagenum=TBTPAGEROOT;
926 memset(iterator, 0, sizeof(TBTIterator));
929 if ( page && !page->iscached )
931 if ( (rc=TBTReadPage(db, pagenum, &page))!=TBT_OK )
934 ptr = (TBTPointer*)( page->page.data );
935 pagenum = ptr->pointer.internal.link;
936 } while( !page->page.isleaf );
939 iterator->page = page;
946 TBTFreeIterator(TBTree *db, TBTIterator *iterator) {
948 if ( iterator->page ) {
949 iterator->page->islocked=0;
950 if ( !iterator->page->iscached )
951 tfree(iterator->page);
954 memset(iterator, 0, sizeof(TBTIterator));
958 TBTIterate(TBTree *db, TBTIterator *iterator, TBTValue *key, TBTValue *value ) {
961 if ( iterator->page==NULL ) {
962 memset(key, 0, sizeof(TBTValue));
963 memset(value, 0, sizeof(TBTValue));
967 if ( (char*)(iterator->ptr) - iterator->page->page.data >= TBTPOINTERSIZE(db) * iterator->page->page.npointer ) {
969 u_int32_t pagenumber = iterator->page->page.rightlink;
970 iterator->page->islocked=0;
971 if ( !iterator->page->iscached )
972 tfree(iterator->page);
975 if ( pagenumber==TBTPAGEROOT ) {
976 memset(key, 0, sizeof(TBTValue));
977 memset(value, 0, sizeof(TBTValue));
980 if ( (rc=TBTReadPage(db, pagenumber, &(iterator->page)))!=TBT_OK )
982 } while ( iterator->page->page.npointer == 0 );
984 iterator->page->islocked=1;
985 iterator->ptr = (TBTPointer*)( iterator->page->page.data );
989 key->length = db->keylen;
990 key->value = iterator->ptr->key.fixed.key;
992 key->length = iterator->ptr->key.varlen.length;
993 key->value = iterator->page->page.data + iterator->ptr->key.varlen.offset;
996 value->length = iterator->ptr->pointer.leaf.length;
997 value->value = iterator->page->page.data + iterator->ptr->pointer.leaf.offset;
999 iterator->ptr = ( TBTPointer* ) ( (char*)(iterator->ptr) + TBTPOINTERSIZE(db) );