1 /* This program generates documents of random lengths with random words */
2 /* using the frequencies of words and document lengths obtained from */
3 /* the Wall Street Journal(1985 - 1990) in order to create databases */
4 /* with similar stastitical properties to real text. Databases can be */
5 /* created in mg or atlas format. */
7 /* Author: D. Psaradellis, December 1994 */
9 /* Modified by J. Zobel, July 1995. */
13 * Copyright RMIT and The University of Melbourne. This code may not be
14 * further distributed without the permission of J. Zobel.
18 * Cleanup and nonsignificant changes by teodor <teodor@sigaev.ru>
28 #define INITIAL_SEED 73802 /* initialise seed */
29 #define linelen 78 /* line length */
33 char *word; /* word */
34 int freq; /* word's frequency */
35 int cum_freq; /* cummulative freq - sum of previous
36 * freq and cum_freq */
37 short word_len; /* length of word in bytes */
41 int doc_len;/* document length */
42 int doc_freq; /* document frequency */
43 int doc_cfreq; /* cummulative frequency */
46 static struct word_info *a;
47 static struct doc_info *d;
48 static int no_of_words = 0;
49 static int word_occur = 0;
50 static int no_of_docs = 0;
51 static int doc_occur = 0;
52 static char buf [BUFLEN];
53 static int isInited = 0;
55 /* This function calculates the no. of words/doclens in the file, */
56 /* by counting the no. of newlines */
58 no_newline(char *filename)
63 if ((fp = fopen(filename, "r")) == NULL)
64 fatal("Cannot open %s\n", filename);
66 while ((c = getc(fp)) != EOF)
73 /* This function dynamically allocates memory to store the words, their */
74 /* frequency, their cummulative frequency and their length in bytes */
76 build_word_array(char *filename)
79 int i , temp_cfreq = 0;
81 if ((fp = fopen(filename, "r")) == NULL)
82 fatal("Cannot open %s\n", filename);
84 a = (struct word_info *)malloc(no_of_words * sizeof(struct word_info));
86 fatal("Can't allocate %d bytes\n", no_of_words * sizeof(struct word_info));
88 for (i = 0; i < no_of_words; i++) {
89 fscanf(fp, "%s", buf); /* store word in temporary buffer */
90 a[i].word = strdup(buf);
92 fatal("strdup failed\n");
93 fscanf(fp, "%d", &a[i].freq);
94 a[i].word_len = strlen(a[i].word);
95 a[i].cum_freq = temp_cfreq;
96 temp_cfreq += a[i].freq;
102 /* This function dynamically allocates memory to store the document lengths, */
103 /* their frequency and their cummulative frequency */
105 build_doc_array(char *filename)
108 int i , temp_cfreq = 0;
110 if ((fp = fopen(filename, "r")) == NULL)
111 fatal("Cannot open %s\n", filename);
113 d = (struct doc_info *)malloc(no_of_docs * sizeof(struct doc_info));
115 fatal("Can't allocate %d bytes\n", no_of_docs * sizeof(struct doc_info));
116 for (i = 0; i < no_of_docs; i++) {
117 fscanf(fp, "%d", &d[i].doc_len);
118 fscanf(fp, "%d", &d[i].doc_freq);
119 d[i].doc_cfreq = temp_cfreq;
120 temp_cfreq += d[i].doc_freq;
126 /* to locate the index of the word required */
128 binsearch_word(int v)
136 if (v < a[x].cum_freq)
138 else if (v >= a[x].cum_freq + a[x].freq)
147 /* This function outputs a specified no. of random words taking into */
148 /* account the linelen */
150 output_words(StringBuf *b, int words_remain)
152 int index , linesize = 0;
154 while (words_remain > 0) { /* generate random no. in range of */
155 index = binsearch_word(rnd() % word_occur); /* 0 - word occurrences */
156 if ((a[index].word_len + linesize) > linelen) { /* if len of line
161 sb_add(b, a[index].word, a[index].word_len);
163 linesize += (a[index].word_len + 1);
172 int index = binsearch_word(rnd() % word_occur);
173 return a[index].word;
177 /* This function uses a binary search algorithm on the doc_cfreq field */
178 /* to locate the index of the document length required */
188 if (v < d[x].doc_cfreq)
190 else if (v >= d[x].doc_cfreq + d[x].doc_freq)
199 /* This function uses a binary search algorithm on the cum_freq field */
201 generate_doc(StringBuf *b) {
205 index = binsearch_doc(rnd() % doc_occur);
206 output_words( b, d[index].doc_len );
211 generate_querywords() {
216 index = binsearch_doc(rnd() % doc_occur);
217 res = (char**) malloc(sizeof(char*) * (d[index].doc_len+1));
219 for(i=0;i<d[index].doc_len;i++)
220 res[i] = get_words();
228 finnegan_init(char *lex_file, char *doc_file, int quiet) {
230 fatal("finnegan is already inited\n");
233 printf("Initialize text generator with:\n");
234 printf("\tfile '%s' - lexeme's distribution\n", lex_file);
235 printf("\tfile '%s' - length's distribution\n", doc_file);
238 no_of_words = no_newline(lex_file);
239 no_of_docs = no_newline(doc_file);
240 word_occur = build_word_array(lex_file);
241 doc_occur = build_doc_array(doc_file);
244 /* output_docs(doc_occur, word_occur); */