2 * Copyright (c) 2005 Teodor Sigaev <teodor@sigaev.ru>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the author nor the names of any co-contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
18 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
25 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
27 * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 #include <sys/types.h>
47 static int TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) ;
48 static int TBTWritePage(TBTree *db, TBTMemPage *page) ;
49 static int TBTNewPage(TBTree *db, TBTMemPage **page) ;
54 TBTOpen(TBTree *db, char *file) {
58 db->fd = open(file, O_RDONLY);
60 tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno));
63 if ( flock( db->fd, LOCK_SH ) < 0 ) {
64 tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno));
69 db->fd = open(file, O_CREAT | O_RDWR, 0666);
71 tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno));
74 if ( flock( db->fd, LOCK_EX ) < 0 ) {
75 tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno));
82 db->Cache=(TBTMemPage**)t0malloc( sizeof(TBTMemPage*)*HASHSIZE(db->npage) );
86 db->cachehits = db->pageread = db->pagewrite = 0;
88 if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) {
89 tlog(TL_CRIT,"TBTOpen: lseek failed: %s",strerror(errno));
90 flock( db->fd, LOCK_UN );
97 tlog(TL_CRIT,"TBTOpen: file is void");
98 flock( db->fd, LOCK_UN );
103 if ( TBTNewPage(db, &page) != TBT_OK ) {
104 flock( db->fd, LOCK_UN );
109 if ( TBTWritePage(db, page)!=TBT_OK ){
110 flock( db->fd, LOCK_UN );
117 db->lastpagenumber=0;
122 TBTClose(TBTree *db) {
130 TBTMemPage *ptr,*tmp;
132 for(i=0; i<HASHSIZE(db->npage); i++) {
144 flock( db->fd, LOCK_UN );
147 return (rc) ? TBT_ERROR : TBT_OK;
151 cmpNPage(const void *a, const void *b) {
152 tassert( (*(TBTMemPage**)a)->pagenumber != (*(TBTMemPage**)b)->pagenumber );
153 return ( (*(TBTMemPage**)a)->pagenumber > (*(TBTMemPage**)b)->pagenumber ) ? 1 : -1;
158 TBTSync(TBTree *db) {
161 if ( db->readonly || db->npage==0 )
165 u_int32_t i, total=0;
166 TBTMemPage *ptr, **data = (TBTMemPage**)tmalloc(db->npage*sizeof(TBTMemPage*)), **pptr;
168 for(i=0; i<HASHSIZE(db->npage); i++) {
171 if ( !ptr->issynced ) {
182 qsort( data, total, sizeof(TBTMemPage*), cmpNPage);
185 while( pptr-data<total) {
186 rc |= TBTWritePage(db, *pptr);
193 if ( fsync(db->fd) ) {
194 tlog(TL_CRIT,"TBTSync: fsync failed: %s",strerror(errno));
198 return (rc) ? TBT_ERROR : TBT_OK;
202 findInCache(TBTree *db, u_int32_t pagenumber) {
205 ptr = db->Cache[ pagenumber % HASHSIZE(db->npage) ];
207 while(ptr && ptr->pagenumber != pagenumber)
214 removeFromCache(TBTree *db, u_int32_t pagenumber) {
215 TBTMemPage *ptr, *prev=NULL;
216 int pos = pagenumber % HASHSIZE(db->npage);
218 ptr = db->Cache[ pos ];
221 if ( ptr->pagenumber == pagenumber ) {
223 prev->link = ptr->link;
225 db->Cache[ pos ] = ptr->link;
234 insertInCache(TBTree *db, TBTMemPage *page) {
235 int pos = page->pagenumber % HASHSIZE(db->npage);
237 page->link = db->Cache[ pos ];
238 db->Cache[ pos ] = page;
242 PageAlloc(TBTree *db) {
245 if ( db->curpage < db->npage ) {
246 page = (TBTMemPage*)tmalloc(sizeof(TBTMemPage));
247 memset( page, 0, TBTMEMPAGEHDRSZ );
249 if ( db->curpage == 0 ) {
250 db->TimeCache = db->TimeCacheLast = page;
252 db->TimeCacheLast->next = page;
253 page->prev = db->TimeCacheLast;
254 db->TimeCacheLast = page;
257 } else if ( db->npage == 0 ) {
258 page = (TBTMemPage*)tmalloc(sizeof(TBTMemPage));
259 memset( page, 0, TBTMEMPAGEHDRSZ );
261 page = db->TimeCache;
262 while( page->next && page->islocked )
264 if ( page==db->TimeCacheLast && page->islocked ) { /*all pages locked*/
265 /* tlog(TL_WARN,"TBTReadPage: all pages in cache locked, increase number of cached pages"); */
266 page = (TBTMemPage*)tmalloc(sizeof(TBTMemPage));
267 memset( page, 0, TBTMEMPAGEHDRSZ );
269 if ( !page->issynced )
270 if ( TBTWritePage(db, page) )
272 removeFromCache(db, page->pagenumber);
273 if ( page != db->TimeCacheLast ) {
274 if ( page == db->TimeCache ) {
275 db->TimeCache = page->next;
276 db->TimeCache->prev = NULL;
278 page->prev->next = page->next;
279 page->next->prev = page->prev;
281 memset( page, 0, TBTMEMPAGEHDRSZ );
282 db->TimeCacheLast->next = page;
283 page->prev = db->TimeCacheLast;
284 db->TimeCacheLast = page;
286 memset( page, 0, TBTMEMPAGEHDRSZ );
295 TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) {
298 if ( db->npage && (*page=findInCache(db, pagenumber))!=NULL ) {
300 if ( *page != db->TimeCacheLast ) {
301 if ( (*page) == db->TimeCache ) {
302 db->TimeCache = (*page)->next;
303 db->TimeCache->prev=NULL;
305 (*page)->prev->next = (*page)->next;
306 (*page)->next->prev = (*page)->prev;
308 db->TimeCacheLast->next = (*page);
309 (*page)->prev = db->TimeCacheLast;
310 db->TimeCacheLast = *page;
311 (*page)->next = NULL;
314 off_t offset = (off_t)pagenumber * (off_t)TBTREEPAGESIZE;
316 if( (*page = PageAlloc(db))==NULL )
319 if ( lseek(db->fd, offset, SEEK_SET) != offset ) {
320 tlog(TL_CRIT,"TBTReadPage: lseek failed: %s",strerror(errno));
324 (*page)->pagenumber=pagenumber;
325 if ( (rc=read(db->fd, &( (*page)->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) {
326 tlog(TL_CRIT,"TBTReadPage: read failed: %s %d %d",strerror(errno), pagenumber, offset);
329 if ( (*page)->iscached )
330 insertInCache(db, *page);
338 TBTWritePage(TBTree *db, TBTMemPage *page) {
339 off_t offset = (off_t)(page->pagenumber) * (off_t)TBTREEPAGESIZE;
342 if ( page->issynced )
345 if ( lseek(db->fd, offset, SEEK_SET) != offset ) {
346 tlog(TL_CRIT,"TBTWritePage: lseek failed: %s",strerror(errno));
350 if ( (size=write(db->fd, &( page->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) {
351 tlog(TL_CRIT,"TBTWritePage: write failed: %s (writed only %d bytes)",strerror(errno), size);
362 TBTNewPage(TBTree *db, TBTMemPage **page) {
363 if ( db->lastpagenumber==0 ) {
365 if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) {
366 tlog(TL_CRIT,"TBTNewPage: lseek failed: %s",strerror(errno));
370 if ( offset % TBTREEPAGESIZE != 0 ) {
371 tlog(TL_CRIT,"TBTNewPage: file corrupted");
375 db->lastpagenumber = offset/TBTREEPAGESIZE;
378 if( (*page = PageAlloc(db))==NULL )
381 (*page)->pagenumber = db->lastpagenumber;
385 db->lastpagenumber++;
387 memset( &((*page)->page), 0, TBTMEMPAGEHDRSZ);
388 (*page)->page.freespace = TBTREEPAGESIZE-TBTPAGEHDRSZ;
390 if ( (*page)->iscached )
391 insertInCache(db, *page);
397 printValue(u_int32_t len, char *ptr) {
407 dumpPage(TBTree *db, TBTPage *page, int follow) {
410 for(j=0;j<follow;j++)
412 printf("Page: f:%d l:%d r:%d n:%d\n",
418 for(i=0;i<page->npointer;i++) {
419 TBTPointer *ptr= (TBTPointer*)(page->data+TBTPOINTERSIZE(db)*i);
420 for(j=0;j<follow+2;j++)
422 printf("i:%d(%d) kl:%d ko:%d ",
424 (db->keylen) ? db->keylen : ptr->key.varlen.length,
425 (db->keylen) ? 0 : ptr->key.varlen.offset
428 printValue(ptr->key.varlen.length, page->data + ptr->key.varlen.offset);
430 printf("'%d'", *(int*)(ptr->key.fixed.key));
431 if ( page->isleaf ) {
432 printf(" vl:%d vo:%d ",
433 ptr->pointer.leaf.length,
434 ptr->pointer.leaf.offset
436 printValue(ptr->pointer.leaf.length, page->data + ptr->pointer.leaf.offset );
439 printf(" page:%d\n", ptr->pointer.internal.link);
441 dumpTree(db, ptr->pointer.internal.link, follow+4);
447 dumpTree(TBTree *db, u_int32_t pagenumber, int follow) {
450 if ( TBTReadPage(db, pagenumber, &page) ) {
451 tlog(TL_CRIT,"BEDA %d\n", pagenumber);
456 dumpPage(db, &(page->page), follow);
458 if ( !page->iscached )
463 findInPage(TBTree *db, TBTPage *page, TBTValue *key, TBTPointer **res) {
464 TBTPointer *StopLow = (TBTPointer*)(page->data);
465 TBTPointer *StopHigh = (TBTPointer*)(page->data + TBTPOINTERSIZE(db)*page->npointer);
466 TBTPointer *StopMiddle;
470 /* Loop invariant: StopLow <= StopMiddle < StopHigh */
471 while (StopLow < StopHigh) {
472 StopMiddle = (TBTPointer*)(((char*)StopLow) + TBTPOINTERSIZE(db)*((((char*)StopHigh - (char*)StopLow)/TBTPOINTERSIZE(db)) >> 1));
473 if ( db->keylen == 0 ) {
474 A.length = StopMiddle->key.varlen.length;
475 A.value = page->data + StopMiddle->key.varlen.offset;
477 A.length = db->keylen;
478 A.value = StopMiddle->key.fixed.key;
481 if ( ISINFPOINTER(db, page, StopMiddle) )
484 diff = db->cmpkey( &A, key );
488 } else if ( diff < 0 )
489 StopLow = (TBTPointer*)((char*)StopMiddle + TBTPOINTERSIZE(db));
491 StopHigh = StopMiddle;
499 findPage(TBTree *db, TBTValue *key, TBTMemPage **page) {
500 u_int32_t pagenumber = TBTPAGEROOT;
507 if ( (rc=TBTReadPage(db, pagenumber, page))!=TBT_OK )
510 if ( (*page)->page.isleaf )
513 findInPage(db, &((*page)->page), key, &ptr );
514 pagenumber = ptr->pointer.internal.link;
515 if ( !(*page)->iscached )
523 TBTFind(TBTree *db, TBTValue *key, TBTValue *value) {
528 memset(value, 0, sizeof(TBTValue));
530 if ( (rc=findPage(db, key, &page))!=TBT_OK )
536 if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) {
537 if ( !(page)->iscached )
542 value->length = ptr->pointer.leaf.length;
543 value->value = tmalloc(value->length);
544 memcpy( value->value, page->page.data + ptr->pointer.leaf.offset, value->length );
546 if ( !(page)->iscached )
553 packLeafKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, TBTValue *value) {
555 if ( (char*)ptr - page->data != page->npointer * TBTPOINTERSIZE(db) )
556 memmove((char*)ptr+TBTPOINTERSIZE(db), ptr, page->npointer * TBTPOINTERSIZE(db) - ( (char*)ptr - page->data ));
558 ptr = (TBTPointer*)(page->data + page->npointer * TBTPOINTERSIZE(db));
561 page->freespace -= TBTPOINTERSIZE(db) + PTRALIGN(value->length);
563 memcpy( ptr->key.fixed.key, key->value, db->keylen );
564 ptr->pointer.leaf.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
566 page->freespace -= PTRALIGN(key->length);
567 ptr->key.varlen.length = key->length;
568 ptr->key.varlen.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
569 memcpy( page->data + ptr->key.varlen.offset, key->value, key->length);
570 ptr->pointer.leaf.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace + PTRALIGN(key->length);
572 ptr->pointer.leaf.length = value->length;
573 memcpy( page->data + ptr->pointer.leaf.offset, value->value, value->length );
577 packInternalKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, u_int32_t pagenumber) {
579 if ( (char*)ptr - page->data != page->npointer * TBTPOINTERSIZE(db) )
580 memmove((char*)ptr+TBTPOINTERSIZE(db), ptr, page->npointer * TBTPOINTERSIZE(db) - ( (char*)ptr - page->data ));
582 ptr = (TBTPointer*)(page->data + page->npointer * TBTPOINTERSIZE(db));
585 page->freespace -= TBTPOINTERSIZE(db);
589 memcpy( ptr->key.fixed.key, key->value, db->keylen );
591 memset( ptr->key.fixed.key, 0, db->keylen );
593 page->freespace -= PTRALIGN(key->length);
594 ptr->key.varlen.length = key->length;
595 ptr->key.varlen.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
597 memcpy( page->data + ptr->key.varlen.offset, key->value, key->length);
599 ptr->pointer.internal.link = pagenumber;
604 deleteKV(TBTree *db, TBTPage *page, TBTPointer **ptrs, u_int32_t num) {
605 static char buf[TBTREEPAGESIZE];
607 TBTPointer *ptr = (TBTPointer*)page->data, *tmp, **ptrptr = ptrs;
609 /* suppose ptrs are ordered! */
611 while( ptrptr - ptrs < num && (char*)ptr - page->data < TBTPOINTERSIZE(db)*page->npointer ) {
612 if ( *ptrptr < ptr ) {
613 tlog(TL_CRIT,"deleteKV: Something wrong: unexistent *ptrs");
615 } else if ( *ptrptr > ptr ) {
617 memcpy(tmp, ptr, TBTPOINTERSIZE(db));
618 ptr = (TBTPointer*)( (char*)ptr + TBTPOINTERSIZE(db) );
619 tmp = (TBTPointer*)( (char*)tmp + TBTPOINTERSIZE(db) );
621 page->freespace += TBTPOINTERSIZE(db) +
622 ( ( page->isleaf ) ? PTRALIGN(ptr->pointer.leaf.length) : 0 ) +
623 ( ( db->keylen ) ? 0 : PTRALIGN(ptr->key.varlen.length) );
625 ptr = (TBTPointer*)( (char*)ptr + TBTPOINTERSIZE(db) );
629 if ( (char*)ptr - page->data < TBTPOINTERSIZE(db)*page->npointer && tmp!=ptr )
630 memmove( tmp, ptr, page->npointer * TBTPOINTERSIZE(db) - ((char*)ptr - page->data) );
634 tmp = (TBTPointer*)page->data;
635 start = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
636 for(i=0;i<page->npointer;i++) {
637 if ( page->isleaf ) {
638 memcpy( bufptr, page->data + tmp->pointer.leaf.offset, tmp->pointer.leaf.length );
639 tmp->pointer.leaf.offset = start + (bufptr-buf);
640 bufptr+=PTRALIGN(tmp->pointer.leaf.length);
643 if ( db->keylen == 0 ) {
644 if ( tmp->key.varlen.length )
645 memcpy( bufptr, page->data + tmp->key.varlen.offset, tmp->key.varlen.length );
646 tmp->key.varlen.offset = start + (bufptr-buf);
647 bufptr+=PTRALIGN(tmp->key.varlen.length);
649 tmp = (TBTPointer*)( (char*)tmp + TBTPOINTERSIZE(db) );
652 memcpy( page->data + start, buf, bufptr-buf );
656 findDelimiter(TBTree *db, TBTPage *page, TBTPointer *ptr, int size, u_int32_t start, int *where) {
657 int *sizes = (int*) tmalloc( sizeof(int) * (page->npointer+1) );
660 int lfree=TBTREEPAGESIZE-TBTPAGEHDRSZ, rfree=TBTREEPAGESIZE-TBTPAGEHDRSZ;
661 int nptr = ( (char*)ptr - page->data ) / TBTPOINTERSIZE(db);
664 for(i=0; i<page->npointer;i++) {
665 tomove = (TBTPointer*)(page->data + i*TBTPOINTERSIZE(db));
666 if ( was==0 && i==nptr ) {
670 sizes[i+was] = TBTPOINTERSIZE(db) +
671 ( (page->isleaf) ? PTRALIGN(tomove->pointer.leaf.length) : 0 ) +
672 ( ( db->keylen ) ? 0 : PTRALIGN(tomove->key.varlen.length) );
677 sizes[page->npointer]=size;
679 for(i=0;i<page->npointer+1;i++) {
691 } else if ( rfree < 0 ) {
699 *where = (nptr < start) ? -1 : 1;
708 populatePages(TBTree *db, TBTPage *srcpage, TBTPage* newpage, u_int32_t start) {
710 TBTPointer *tomove, **todelete = tmalloc( sizeof(TBTPointer*)* srcpage->npointer );
711 u_int32_t numtodelete=0;
714 for(i=start;i<srcpage->npointer;i++) {
715 tomove = (TBTPointer*)(srcpage->data + i*TBTPOINTERSIZE(db));
717 todelete[numtodelete] = tomove;
721 k.value = tomove->key.fixed.key;
723 k.length = tomove->key.varlen.length;
724 k.value = srcpage->data + tomove->key.varlen.offset;
727 if ( srcpage->isleaf ) {
728 v.length = tomove->pointer.leaf.length;
729 v.value = srcpage->data + tomove->pointer.leaf.offset;
730 packLeafKV(db, newpage, NULL, &k, &v);
732 packInternalKV(db, newpage, NULL, &k, tomove->pointer.internal.link);
735 deleteKV(db, srcpage, todelete, numtodelete);
740 splitPage(TBTree *db, TBTMemPage *srcpage, TBTMemPage** newpage, TBTPointer **ptr, int size, TBTValue *key, TBTValue *value, u_int32_t pagenumber) {
742 int start=0, where=0;
743 int nptr = ( (char*)(*ptr) - srcpage->page.data ) / TBTPOINTERSIZE(db);
745 if ( TBTNewPage(db, newpage) )
749 srcpage->issynced=tmp->issynced=0;
751 tmp->page.rightlink = srcpage->page.rightlink;
752 srcpage->page.rightlink = tmp->pagenumber;
753 tmp->page.isleaf = srcpage->page.isleaf;
755 switch(db->strategy) {
756 case TBT_STATEGY_HALF:
757 start = findDelimiter(db, &(srcpage->page), *ptr, size, (srcpage->page.npointer+1)/2, &where);
759 case TBT_STATEGY_LEFTFILL:
760 start = findDelimiter(db, &(srcpage->page), *ptr, size, srcpage->page.npointer, &where);
762 case TBT_STATEGY_RIGHTFILL:
763 start = findDelimiter(db, &(srcpage->page), *ptr, size, 0, &where);
766 tlog(TL_CRIT,"splitPage: unknown strategy: %d", db->strategy);
770 populatePages(db, &(srcpage->page), &(tmp->page), start);
775 *ptr = (TBTPointer*)(tmp->page.data + nptr*TBTPOINTERSIZE(db));
777 if ( tmp->page.isleaf )
778 packLeafKV(db, &(tmp->page), *ptr, key, value);
780 packInternalKV(db, &(tmp->page), *ptr, key, pagenumber);
786 savePage(TBTree *db, TBTMemPage *page) {
790 if ( !page->iscached ) {
791 if ( page->issynced==0 && (rc=TBTWritePage(db, page)) != TBT_OK )
799 layerInsert(TBTree *db, u_int32_t pagenumber, TBTMemPage **left, TBTMemPage **right, TBTValue *key, TBTValue *value) {
801 TBTPointer *ptr=NULL;
802 TBTMemPage *page=NULL, *newright=NULL;
804 if ( (rc=TBTReadPage(db, pagenumber, &page))!=TBT_OK )
809 fnd = findInPage(db, &(page->page), key, &ptr );
812 if ( page->page.isleaf ) {
813 u_int32_t size = TBTPOINTERSIZE(db) +
814 PTRALIGN(value->length) +
815 ( ( db->keylen ) ? 0 : PTRALIGN(key->length) );
817 if ( size>(TBTREEPAGESIZE-TBTPAGEHDRSZ)/2 ) {
818 tlog(TL_ALARM,"Huge size: %d bytes", size);
823 deleteKV(db, &(page->page), &ptr, 1);
827 if ( size <= page->page.freespace ) {
828 packLeafKV(db, &(page->page), ptr, key, value);
831 rc = splitPage(db, page, &newright, &ptr, size, key, value, 0);
832 if ( rc != TBT_OK ) return rc;
834 page->issynced = newright->issynced = 0;
835 page->islocked = newright->islocked = 1;
838 if ( (rc=layerInsert(db, ptr->pointer.internal.link, left, right, key, value))!=TBT_OK ) {
843 if ( *left ) { /* child page was splited */
845 TBTPointer *rightptr = (TBTPointer*)( (*left)->page.data + ((*left)->page.npointer-1) * TBTPOINTERSIZE(db) );
848 if ( ISINFPOINTER(db, &(page->page), ptr) ) {
849 deleteKV(db, &(page->page), &ptr, 1);
850 K.length = 0; K.value = NULL;
851 packInternalKV(db, &(page->page), ptr, &K, (*right)->pagenumber);
853 ptr->pointer.internal.link=(*right)->pagenumber;
855 K.length = ( db->keylen ) ? db->keylen : rightptr->key.varlen.length;
856 K.value = ( db->keylen ) ? rightptr->key.fixed.key : (*left)->page.data + rightptr->key.varlen.offset;
858 size = TBTPOINTERSIZE(db) + ( ( db->keylen ) ? 0 : PTRALIGN(K.length) );
860 if ( size <= page->page.freespace ) {
861 packInternalKV(db, &(page->page), ptr, &K, (*left)->pagenumber);
864 rc = splitPage(db, page, &newright, &ptr, size, &K, NULL, (*left)->pagenumber);
865 if ( rc != TBT_OK ) return rc;
867 page->issynced = newright->issynced = 0;
868 page->islocked = newright->islocked = 1;
874 if ( (rc=savePage(db, *left))!=TBT_OK )
876 if ( (rc=savePage(db, *right))!=TBT_OK )
882 if ( page->pagenumber==TBTPAGEROOT ) { /* root was splited */
887 if ( (rc=TBTNewPage(db,&newleft))!=TBT_OK )
889 memcpy( &(newleft->page) , &(page->page), TBTREEPAGESIZE);
890 memset( &(page->page), 0, TBTREEPAGESIZE);
891 page->page.freespace=TBTREEPAGESIZE-TBTPAGEHDRSZ;
893 ptr = (TBTPointer*)( newleft->page.data + (newleft->page.npointer-1) * TBTPOINTERSIZE(db) );
894 K.length = ( db->keylen ) ? db->keylen : ptr->key.varlen.length;
895 K.value = ( db->keylen ) ? ptr->key.fixed.key : newleft->page.data + ptr->key.varlen.offset;
896 packInternalKV(db, &(page->page), NULL, &K, newleft->pagenumber);
898 ptr = (TBTPointer*)( newright->page.data + (newright->page.npointer-1) * TBTPOINTERSIZE(db) );
901 packInternalKV(db, &(page->page), NULL, &K, newright->pagenumber);
903 if ( (rc=savePage(db, newright))!=TBT_OK )
905 if ( (rc=savePage(db, newleft))!=TBT_OK )
907 if ( (rc=savePage(db, page))!=TBT_OK )
913 } else if ( (rc=savePage(db, page))!=TBT_OK )
921 TBTInsert(TBTree *db, TBTValue *key, TBTValue *value) {
923 TBTMemPage *lpage=NULL, *rpage=NULL;
925 rc = layerInsert(db, TBTPAGEROOT, &lpage, &rpage, key, value);
927 tassert( lpage==NULL && rpage==NULL );
933 TBTDelete(TBTree *db, TBTValue *key) {
938 if ( (rc=findPage(db, key, &page))!=TBT_OK )
944 if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) {
945 if ( !(page)->iscached )
950 deleteKV(db, &(page->page), &ptr, 1);
953 rc=savePage(db, page);
959 TBTInitIterator(TBTree *db, TBTIterator *iterator) {
960 TBTMemPage *page=NULL;
963 u_int32_t pagenum=TBTPAGEROOT;
965 memset(iterator, 0, sizeof(TBTIterator));
968 if ( page && !page->iscached )
970 if ( (rc=TBTReadPage(db, pagenum, &page))!=TBT_OK )
973 ptr = (TBTPointer*)( page->page.data );
974 pagenum = ptr->pointer.internal.link;
975 } while( !page->page.isleaf );
978 iterator->page = page;
985 TBTFreeIterator(TBTree *db, TBTIterator *iterator) {
987 if ( iterator->page ) {
988 iterator->page->islocked=0;
989 if ( !iterator->page->iscached )
990 tfree(iterator->page);
993 memset(iterator, 0, sizeof(TBTIterator));
997 TBTIterate(TBTree *db, TBTIterator *iterator, TBTValue *key, TBTValue *value ) {
1000 if ( iterator->page==NULL ) {
1001 memset(key, 0, sizeof(TBTValue));
1002 memset(value, 0, sizeof(TBTValue));
1006 if ( (char*)(iterator->ptr) - iterator->page->page.data >= TBTPOINTERSIZE(db) * iterator->page->page.npointer ) {
1008 u_int32_t pagenumber = iterator->page->page.rightlink;
1009 iterator->page->islocked=0;
1010 if ( !iterator->page->iscached )
1011 tfree(iterator->page);
1012 iterator->page=NULL;
1014 if ( pagenumber==TBTPAGEROOT ) {
1015 memset(key, 0, sizeof(TBTValue));
1016 memset(value, 0, sizeof(TBTValue));
1019 if ( (rc=TBTReadPage(db, pagenumber, &(iterator->page)))!=TBT_OK )
1021 } while ( iterator->page->page.npointer == 0 );
1023 iterator->page->islocked=1;
1024 iterator->ptr = (TBTPointer*)( iterator->page->page.data );
1028 key->length = db->keylen;
1029 key->value = iterator->ptr->key.fixed.key;
1031 key->length = iterator->ptr->key.varlen.length;
1032 key->value = iterator->page->page.data + iterator->ptr->key.varlen.offset;
1035 value->length = iterator->ptr->pointer.leaf.length;
1036 value->value = iterator->page->page.data + iterator->ptr->pointer.leaf.offset;
1038 iterator->ptr = ( TBTPointer* ) ( (char*)(iterator->ptr) + TBTPOINTERSIZE(db) );