Clean up code
[tedtools.git] / sfxtest.c
1 /*
2  * Copyright (c) 2004 Teodor Sigaev <teodor@sigaev.ru>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *      notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *      notice, this list of conditions and the following disclaimer in the
12  *      documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the author nor the names of any co-contributors
14  *      may be used to endorse or promote products derived from this software
15  *      without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
18  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
25  * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
27  * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29
30 #include <stdio.h>
31 #include <string.h>
32 #include <stdlib.h>
33 #include <unistd.h>
34 #include <ctype.h>
35 #include <sys/time.h>
36
37 #include "tools.h"
38 #include "tlog.h"
39 #include "tmalloc.h"
40 #include "sfxstr.h"
41
42 extern char *optarg;
43 static int verbose=1;
44
45 static int
46 STRINGCMP(const void *a, const void *b) {
47         return strcmp( *(char**)a, *(char**)b );
48 }
49
50 static void
51 usage() {
52         puts("Usage:");
53         puts("./sfxtest -d datafile [-b| [-a addondatafile] [-l | -p | -r | -g] [-n]] [-q]");
54         puts("Detailed description:");
55
56         puts("  Binary search");
57         puts("    ./sfxtest -b -d datafile [-q]");
58         puts("       -q - quiet (no output)");
59
60         puts("  Optimal suffix search");
61         puts("    ./sfxtest -d datafile [-a addondatafile] [-l | -p | -r | -g]  [-n] [-q]");
62         puts("       -a addondatafile - addon file data for loading");
63         puts("       -g - histogramm mode");
64         puts("       -l - listing mode");
65         puts("       -p - prefix search mode");
66         puts("       -r - range search mode");
67         puts("       -n - enumerate entries");
68         puts("       -q - quiet (no output)");
69         
70         exit(1);
71 }
72
73 static void
74 gistogramm(SFSNode *node, int *gist) {
75         if ( !node )
76                 return;
77
78       if ( node->isskip ) {
79                if ( node->haschild ) {
80                        gist[1]++;
81                        gistogramm( *(SFSNode**)(node->data), gist );
82                } else
83                       gist[0]++;
84        } else {
85                SFSNodeData *data;
86
87                gist[ node->nchild ]++;
88
89                data=node->data;
90                while(data - node->data < node->nchar) {
91                        if ( data->haschild )
92                                gistogramm( *(SFSNode**)( ((char*)data) + data->child ), gist);
93                        data++;
94                }
95         }
96 }
97
98 int
99 main(int argn, char *argv[]) {
100         struct timeval begin,end;
101         char *datafile=NULL;
102         int i, binary=0;
103         FILE *in;
104         char buf[4096];
105         double sumtime=0.0;
106         int structsize=0, list=0, prefix=0, range=0, enumerate=0,gmode=0;
107         char *addondatafile=NULL;
108
109         int len=4096, curlen=0, checked=0, success=0;
110         char **data;
111         void *res;
112         
113         while ((i = getopt(argn, argv, "pd:bqha:lrng")) != EOF) {
114                 switch(i) {
115                         case 'a':
116                                 addondatafile=optarg;
117                                 break;
118                         case 'd':
119                                 datafile=optarg;
120                                 break;
121                         case 'n':
122                                 enumerate=1;
123                                 break;
124                         case 'g':
125                                 gmode=1;
126                                 break;
127                         case 'r':
128                                 range=1;
129                                 break;
130                         case 'p':
131                                 prefix=1;
132                                 break;
133                         case 'l':
134                                 list=1;
135                                 break;
136                         case 'b':
137                                 binary=1;
138                                 break;
139                         case 'q':
140                                 verbose=0;
141                                 break;
142                         case 'h':
143                         default: 
144                                 usage();
145                 }
146         }
147
148         if (!datafile) usage();
149
150         opentlog(TL_OPEN_STDERR|TL_OPEN_SYSLOG|TL_OPEN_FILE, TL_INFO, "./sfxtest.log");
151         if ( (in=fopen(datafile,"r"))==NULL ) 
152                 tlog(TL_CRIT|TL_EXIT,"Beda with %s", datafile);
153
154         data=(char**)tmalloc(len*sizeof(char*));
155         while(fgets(buf,4096,in)) {
156                 structsize+=clrspace(buf)+1;
157                 if ( !*buf ) continue;
158                 if (curlen+2 > len) {
159                         len*=2;
160                         data=(char**)trealloc(data, len*sizeof(char*));
161                 }
162                 data[curlen]=tstrdup(buf);
163                 curlen++;
164         }
165         fclose(in);
166         data[curlen]=NULL;
167         structsize+=sizeof(char*)*curlen;
168
169         if ( binary == 1 ) {
170                 gettimeofday( &begin, NULL );
171                 qsort(data, curlen, sizeof(char**), STRINGCMP);
172                 tlog(TL_INFO,"Init time: %.03f secs",  elapsedtime(&begin) );
173                 tlog(TL_INFO,"Memory allocated: %.2fMb", ((float)structsize)/(1024.0*1024.0));
174
175                 gettimeofday( &begin, NULL );
176                 while(fgets(buf,4096,stdin)) {
177                         char *ptr=buf;
178                         len = clrspace(buf);
179                         if (!len) continue;
180                         res = bsearch(&ptr, data, curlen, sizeof(char**), STRINGCMP);
181                         
182                         if (verbose) 
183                                 puts( (res) ? "Y" : "N" );
184
185                         checked++;
186                         if (res) success++;
187                 }
188                 gettimeofday( &end, NULL );
189         } else {
190                 SFSTree info;
191                 SFSDataIO n = {NULL,0,NULL};
192                 n.data = (void*)&enumerate;
193
194                 gettimeofday( &begin, NULL );
195                 if (enumerate) {
196                         char **ptr=data;
197
198                         SFSInit_dp(&info,sizeof(enumerate),NULL);
199                         while(*ptr) {
200                                 n.key = *ptr;
201                                 SFSAdd(&info, &n);
202                                 enumerate++;
203                                 ptr++;
204                         }
205                 } else
206                         SFSInit_c(&info,data);
207                 tlog(TL_INFO,"Init time: %.03f secs",  elapsedtime(&begin) );
208                 tlog(TL_INFO,"Memory allocated: %.2fMb", ((float)info.totalen)/(1024.0*1024.0));
209                 tlog(TL_INFO,"Number of nodes: %d", info.nnodes);
210
211                 for(i=0;i<curlen;i++)
212                         tfree(data[i]);
213                 tfree(data);
214
215                 if ( addondatafile ) {
216                         int cntadd=0;
217
218                         if ( (in=fopen(addondatafile,"r"))==NULL ) 
219                                 tlog(TL_CRIT|TL_EXIT,"Beda with %s", addondatafile);
220                         n.key=buf;
221
222                         gettimeofday( &begin, NULL );
223                         while(fgets(buf,4096,in)) {
224                                 if ( !clrspace(buf) ) continue;
225                                 n.keylen=0;
226                                 SFSAdd(&info, &n);
227                                 if (enumerate) enumerate++;
228                                 cntadd++;
229                         }
230                         gettimeofday( &end, NULL );
231                         fclose(in);
232                         sumtime = timediff(&begin,&end);
233                         tlog(TL_INFO,"Add time: %.03f secs,   %.03f per second", sumtime, ((double)cntadd)/sumtime);
234                         tlog(TL_INFO,"Added item: %d", cntadd);
235                         tlog(TL_INFO,"Memory allocated: %.2fMb", ((float)info.totalen)/(1024.0*1024.0));
236                         tlog(TL_INFO,"Number of nodes: %d", info.nnodes);
237                         curlen+=cntadd;
238                 }
239
240                 if (list) {
241                         SFSDataIO out;
242
243                         gettimeofday( &begin, NULL );
244                         SFSIteratorStart(&info);
245                         while( SFSIterate(&info, &out) ) { 
246                                 if ( verbose ) {
247                                         if (enumerate) {
248                                                 fputs(out.key, stdout);
249                                                 printf(" %d\n", *(int*)(out.data));
250                                         } else
251                                                 puts(out.key);
252                                 }
253                                 checked++; success++;
254                         }
255                         gettimeofday( &end, NULL );
256                 } else if ( gmode ) {
257                         int Gist[256];
258    
259                         memset(Gist,0,sizeof(int)*256);
260                         gettimeofday( &begin, NULL );
261                         gistogramm(info.node, Gist);
262                         gettimeofday( &end, NULL );
263                         for(i=0;i<256;i++)
264                                 printf("%d\t%d\n",i,Gist[i]);
265                 } else if ( prefix ) {
266                         SFSDataIO out;
267                         
268                         gettimeofday( &begin, NULL );
269                         while(fgets(buf,4096,stdin)) {
270                                 len = clrspace(buf);
271                                 if (!len) continue;
272
273                                 SFSPrefixIteratorStart(&info,buf);
274                                 while( SFSIterate(&info, &out)) { 
275                                         if (verbose) { 
276                                                 if (enumerate) {
277                                                         putchar('>'); 
278                                                         fputs(out.key, stdout);
279                                                         printf(" %d\n", *(int*)(out.data));
280                                                 } else {
281                                                         putchar('>'); 
282                                                         puts(out.key);
283                                                 }
284                                         }
285                                         success++;
286                                 }       
287                                 checked++;
288                         }
289                         gettimeofday( &end, NULL );
290                 } else if ( range ) {
291                         SFSDataIO f,l;
292                         
293                         gettimeofday( &begin, NULL );
294                         while(fgets(buf,4096,stdin)) {
295                                 len = clrspace(buf);
296                                 if (!len) continue;
297
298                                 if ( SFSRange(&info,buf,&f,&l) ) {
299                                         if (verbose) { 
300                                                 if (enumerate) {
301                                                         putchar('>'); 
302                                                         fputs(f.key, stdout);
303                                                         printf(" %d\n", *(int*)(f.data));
304                                                         putchar('>'); 
305                                                         fputs(l.key, stdout);
306                                                         printf(" %d\n", *(int*)(l.data));
307                                                 } else {
308                                                         putchar('>'); puts(f.key);
309                                                         putchar('>'); puts(l.key);
310                                                 }
311                                         }
312                                         success++;
313                                 }       
314                                 checked++;
315                         }
316                         gettimeofday( &end, NULL );
317                 } else {
318                         gettimeofday( &begin, NULL );
319                         while(fgets(buf,4096,stdin)) {
320                                 len = clrspace(buf);
321                                 if (!len) continue;
322
323                                 res = SFSFindData(&info,buf);
324                                 if (verbose) {
325                                         if (enumerate && res)
326                                                 printf("%d\n", *(int*)(res));
327                                         else 
328                                                 puts( (res) ? "Y" : "N" );
329                                 }
330
331                                 checked++;
332                                 if (res) success++;
333                         }
334                         gettimeofday( &end, NULL );
335                 }
336                 SFSFree(&info,NULL);
337         }
338
339         sumtime = timediff(&begin,&end);
340         tlog(TL_INFO,"Total execution time: %.03f secs", sumtime);
341         tlog(TL_INFO,"Total words in data: %d; Checked: %d; Success: %d", curlen, checked, success);
342         tlog(TL_INFO,"%.2f words per second", ((double)checked)/sumtime);
343         
344         closetlog();
345         return 0;
346 }