/*
 * Copyright (c) 2006 Teodor Sigaev
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the names of any co-contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
 * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>		/* strcasecmp() */

/*
 * Utility for filtering stop words out of a lex file.
 *
 * Usage: prog stopfile < lex
 *
 * Each input line on stdin is expected to look like "word count".  A line
 * is echoed to stdout as "word count" unless the word is short, is listed
 * in the stop file (compared case-insensitively), or the count is not
 * positive.  Lines that do not parse as "word count" are dropped.
 */

/* Size of every line buffer used by this program, terminator included. */
#define TXTBUFLEN 4096

/* Words of this many bytes or fewer are always treated as stop words. */
#define MAXSHORTLEN 4

/* Stop words loaded from the stop file, sorted with comparestr(). */
typedef struct
{
	int			len;		/* number of entries in stop[] */
	char	  **stop;		/* heap-allocated array of heap-allocated words */
} StopList;

/*
 * qsort()/bsearch() comparator.  Both arguments point at "char *" elements;
 * the strings they refer to are compared ignoring case.
 */
static int
comparestr(const void *a, const void *b)
{
	const char *const *lhs = a;
	const char *const *rhs = b;

	return strcasecmp(*lhs, *rhs);
}

/*
 * Load the stop words found in "filename" (one per line) into *s and sort
 * them so searchstoplist() can binary-search the result.
 *
 * Empty lines are skipped.  Trailing '\n' / '\r' characters are removed,
 * but only when they are actually present, so a final line without a
 * newline keeps its last character.
 *
 * Terminates the process with a message on stderr if the file cannot be
 * opened or memory runs out.  The list is never freed; it lives until the
 * process exits.
 */
static void
readstoplist(const char *filename, StopList *s)
{
	char	  **stop = NULL;
	FILE	   *hin;
	char		buf[TXTBUFLEN];
	int			reallen = 0;	/* allocated capacity of stop[] */

	s->len = 0;
	s->stop = NULL;

	hin = fopen(filename, "r");
	if (hin == NULL)
	{
		fprintf(stderr, "Can't open %s: %s\n", filename, strerror(errno));
		exit(EXIT_FAILURE);
	}

	while (fgets(buf, sizeof buf, hin))
	{
		size_t		linelen = strlen(buf);
		char	   *word;

		/*
		 * fgets() keeps the newline when one was read.  Strip line
		 * terminators only; never index buf[-1] on a zero-length read.
		 */
		while (linelen > 0 &&
			   (buf[linelen - 1] == '\n' || buf[linelen - 1] == '\r'))
			linelen--;

		if (linelen == 0)
			continue;

		/* Grow the pointer array geometrically, starting at 16 slots. */
		if (s->len >= reallen)
		{
			char	  **tmp;

			reallen = (reallen != 0) ? reallen * 2 : 16;
			tmp = realloc(stop, sizeof *stop * (size_t) reallen);
			if (tmp == NULL)
			{
				fprintf(stderr, "Not enough memory\n");
				exit(EXIT_FAILURE);
			}
			stop = tmp;
		}

		/* The length is already known, so copy it directly. */
		word = malloc(linelen + 1);
		if (word == NULL)
		{
			fprintf(stderr, "Not enough memory\n");
			exit(EXIT_FAILURE);
		}
		memcpy(word, buf, linelen);
		word[linelen] = '\0';

		stop[s->len] = word;
		s->len++;
	}

	fclose(hin);

	s->stop = stop;
	if (s->stop != NULL && s->len > 1)
		qsort(s->stop, (size_t) s->len, sizeof *s->stop, comparestr);
}

/*
 * Decide whether "key" must be filtered out.
 *
 * Returns 1 when key is at most MAXSHORTLEN bytes long or is present in
 * the (sorted) stop list, ignoring case; returns 0 otherwise.
 */
static int
searchstoplist(const StopList *s, const char *key)
{
	if (strlen(key) <= MAXSHORTLEN)
		return 1;

	if (s->stop == NULL || s->len <= 0)
		return 0;

	/* Elements are "char *", so the search key is a pointer to one. */
	return bsearch(&key, s->stop, (size_t) s->len, sizeof *s->stop,
				   comparestr) != NULL;
}

/*
 * Entry point: load the stop list named by argv[1], then filter the
 * "word count" lines arriving on stdin to stdout.
 */
int
main(int argn, char *argv[])
{
	char		buf[TXTBUFLEN];
	StopList	sl = {0, NULL};

	if (argn != 2)
	{
		fprintf(stderr, "Usage: %s stopfile < lex\n", argv[0]);
		exit(EXIT_FAILURE);
	}

	readstoplist(argv[1], &sl);

	while (fgets(buf, sizeof buf, stdin))
	{
		char		wrd[TXTBUFLEN];
		int			occur;

		/*
		 * wrd is as large as buf, so the unbounded %s cannot write past
		 * its end: the token is never longer than the line it came from.
		 */
		if (sscanf(buf, "%s %d", wrd, &occur) != 2)
			continue;

		if (occur <= 0 || searchstoplist(&sl, wrd))
			continue;

		printf("%s %d\n", wrd, occur);
	}

	return 0;
}