/* * Copyright (c) 2004 Teodor Sigaev * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include #include #include static void pushoutstr(FILE *out, char *buf, int len); static void usage() { printf("Clear text from lib.ru for Pocket PC, Version 0.2.\n"); printf("Copyright (c) 2004 Teodor Sigaev \n"); printf(" All rights reserved.\n"); printf("Usage:\n clrlibru [-i INPUTFILE] [-o OUTPUT] [-l NUMSPACE]\n"); exit(0); } static char* RemoveTag[]={ "select", "head", "div", "a", "form", "script", "style", "title", NULL }; static int is_rtag(char *tag, int len) { char **ptr=RemoveTag; tag[len]='\0'; while( *ptr ) { if ( strcmp(tag, *ptr) == 0 ) return 1; ptr++; } return 0; } typedef struct { char *quote; char *str; int len; } Quote; static Quote quoteChange[] = { {"quot", "\"", -1}, {"laquo", "\"", -1}, {"raquo", "\"", -1}, {"quot", "\"", -1}, {"lt", "<", -1}, {"gt", ">", -1}, {"nbsp", " ", -1}, {"mdash", "-", -1}, {"amp", "&", -1}, {"shy", "", -1}, {NULL, NULL, -1} }; static int pushoutquot(FILE *out, char *buf, int buflen) { Quote *ptr = quoteChange; buf[buflen]='\0'; while( ptr->quote ) { if ( strcmp( ptr->quote, buf ) == 0 ) { if ( ptr->len < 0 ) ptr->len = strlen( ptr->str ); pushoutstr( out, ptr->str, ptr->len ); return 1; } ptr++; } return 0; } static char *optarg = NULL; static int current=1; int mgetopt(int argn, char* argv[], char *option) { char key; if ( current >= argn ) return -1; key = *(argv[current]+1); if ( *(argv[current]) == '-' || *(argv[current]) == '/' ) { char *ptr = strchr( option, key ); if ( ptr == NULL ) { printf( "Unknown option: %s\n", argv[current]); exit(1); } current++; if ( *(ptr+1) == ':' ) { if (current < argn) { optarg=argv[current]; current++; } else { printf("No value for -%c\n" ,key); exit(1); } } else optarg=NULL; } else { printf("Unknown option: %s\n", argv[current]); exit(1); } return (int)key; } #define INTXT 0 #define INTAG 1 #define FINDEND 2 #define INDROPTAG 3 #define INDROPINTAG 4 #define INHEADTAG 5 #define INDROPINCLSTAG 6 #define WAITAFTERRED 7 #define COMMENTBEGIN1 8 #define COMMENTBEGIN2 9 
/*
 * Remaining parser-state constants, output helpers and the main
 * input-processing loop.
 *
 * NOTE(review): this region appears to have lost text and should be
 * restored from a pristine copy before it is modified:
 *   - in pushoutstr() the loop condition "while( ptr-buf" runs straight
 *     into "= BUFFERLENGTH )", which belongs to a "-l" range check inside
 *     an option switch; the end of pushoutstr(), the header of the
 *     following function and the declarations of in, out, buf, state,
 *     lenbuf, closelen, spacelen and ch are not present;
 *   - in the COMMENTBEGIN1 branch the string literal opened after
 *     "pushoutstr(out," is never closed, and the handlers for the
 *     COMMENTBEGIN2, COMMENTIN, COMMENTEND1 and COMMENTEND2 states are
 *     mostly missing;
 *   - presumably spans that looked like markup (a '<' followed by text up
 *     to the next '>') were stripped -- TODO confirm;
 *   - REDSTRING may originally have held more than one blank -- TODO confirm.
 * All statements also share one physical line, so the #define directives
 * below would swallow the code that follows them.
 *
 * pushout(out, type, value), as far as the visible code shows:
 *   Char      - writes value, except that a space or tab is suppressed
 *               when the previous item was Tag, NewLine or None; a
 *               written character resets the newline counter; value is
 *               remembered in prevvalue either way.
 *   NewLine   - writes '\n' only while fewer than two newlines have been
 *               counted since the counter was last reset; always counts.
 *   Tag       - acted on only when the previous item was not Tag; writes
 *               one space unless the previous item was None, NewLine, or
 *               a Char whose value was space, tab or newline.
 *   Paragraph - acted on only when the previous item was not Paragraph;
 *               emits a NewLine, writes REDSTRING and resets the counter.
 *   any other type that differs from the previous one prints
 *   "Unknown type" and exits with status 1.
 *   The type passed in is recorded as the previous type on return.
 *
 * Main loop (one character per iteration, '\r' is skipped):
 *   INTXT          - '<' starts a tag name, '\n' enters FINDEND with
 *                    lenbuf=1, '&' starts an entity in buf; any other
 *                    character is emitted as Char.
 *   INHEADTAG      - collects lower-cased letters of the tag name; '!'
 *                    enters COMMENTBEGIN1; '>' ends the tag (literal "<>"
 *                    for an empty name, otherwise Tag, or INDROPTAG when
 *                    is_rtag() matches); a non-letter other than '/'
 *                    right after '<' is emitted as text; otherwise the
 *                    rest of the tag is skipped (INTAG / INDROPTAG).
 *   INTAG          - skips to '>' and emits Tag.
 *   INDROPTAG, INDROPINTAG, INDROPINCLSTAG
 *                  - discard input until "</" followed by the same tag
 *                    name is seen, then continue in INTAG.
 *   FINDEND        - after a newline: counts spaces/tabs in lenbuf and
 *                    emits Paragraph once lenbuf exceeds spacelen; another
 *                    '\n' emits two NewLines; any other character emits a
 *                    space and is pushed back for INTXT.
 *   WAITAFTERRED   - skips whitespace following a paragraph indent.
 *   INQUOTE        - collects entity letters; on ';' tries pushoutquot();
 *                    otherwise flushes the raw buffer and pushes the
 *                    character back.
 *   An unrecognised state prints "Unknown state" and exits with status 1.
 */
#define COMMENTIN 10 #define COMMENTEND1 11 #define COMMENTEND2 12 #define INQUOTE 13 #define BUFFERLENGTH 8192 #define REDSTRING " " typedef enum TypeOut { Char, NewLine, Tag, Paragraph, None } TypeOut; static void pushout( FILE *out, TypeOut type, int value ) { static TypeOut PrevType=None; static int prevvalue=0; static int newlinecount=0; if ( type == Char ) { if ( !(( PrevType==Tag || PrevType==None || PrevType==NewLine ) && ( value == ' ' || value == '\t' )) ) { newlinecount=0; fputc(value ,out); } prevvalue = value; } else if ( type == NewLine ) { if ( newlinecount < 2 ) fputc('\n', out); newlinecount++; } else if ( type != PrevType ) { switch(type) { case Tag: if ( !(PrevType==None || PrevType==NewLine || ( PrevType==Char && ( prevvalue == ' ' || prevvalue == '\t' || prevvalue == '\n' ) )) ) { newlinecount=0; fputc(' ', out); } break; case Paragraph: pushout(out, NewLine, 0); fwrite(REDSTRING, sizeof(char), strlen(REDSTRING), out); newlinecount=0; break; default: printf("Unknown type: %d", type); exit(1); } } PrevType = type; } static void pushoutstr(FILE *out, char *buf, int len) { char *ptr=buf; while( ptr-buf= BUFFERLENGTH ) { printf("-l should be >= 0 and < %d\n", BUFFERLENGTH); exit(1); } break; case 'h': case '?': default: usage(); } } while( (ch=getc(in)) != EOF ) { if ( ch == '\r' ) continue; if ( state==INTXT ) { if ( ch == '<' ) { state=INHEADTAG; lenbuf=0; } else if ( ch == '\n' ) { state=FINDEND; lenbuf=1; } else if ( ch=='&' ) { *buf='&'; lenbuf=1; state=INQUOTE; } else { pushout(out, Char, ch); } } else if ( state==INHEADTAG ) { if ( isalpha(ch) ) { if ( lenbuf < BUFFERLENGTH-1 ) { buf[ lenbuf ] = tolower(ch); lenbuf++; } } else if ( ch == '!' 
) { state = COMMENTBEGIN1; } else if ( ch == '>' ) { if ( is_rtag(buf,lenbuf) ) { state = INDROPTAG; closelen=0; } else { state=INTXT; if ( lenbuf==0 ) pushoutstr(out, "<>", 2); else pushout(out, Tag, 0); } } else if ( lenbuf == 0 && ch != '/' ) { pushout(out, Char, '<'); pushout(out, Char, ch); state=INTXT; } else { if ( is_rtag(buf,lenbuf) ) { state = INDROPTAG; closelen=0; } else { state=INTAG; } } } else if ( state==INTAG ) { if ( ch == '>' ) { state=INTXT; pushout(out, Tag, 0); } } else if ( state == INDROPTAG ) { if ( ch == '<' ) { state=INDROPINTAG; closelen=0; } } else if ( state == INDROPINTAG ) { if ( ch == '/' ) state=INDROPINCLSTAG; else state=INDROPTAG; } else if ( state == INDROPINCLSTAG ) { if ( isalpha(ch) ) { if ( closelen < lenbuf && tolower(ch) == buf[closelen] ) { closelen++; if ( closelen==lenbuf ) state=INTAG; } else state=INDROPTAG; } else state=INDROPTAG; } else if ( state==FINDEND ) { if ( ch == ' ' || ch == '\t' ) { lenbuf++; if ( lenbuf > spacelen ) { pushout( out, Paragraph, 0 ); state=WAITAFTERRED; } } else if ( ch=='\n' ) { pushout( out, NewLine, 0 ); pushout( out, NewLine, 0 ); lenbuf++; } else { state=INTXT; pushout(out, Char, ' '); ungetc(ch,in); } } else if ( state==WAITAFTERRED ) { if ( !isspace(ch) ) { ungetc(ch,in); state=INTXT; } } else if ( state==COMMENTBEGIN1 ) { if ( ch == '-' ) { state = COMMENTBEGIN2; } else { pushoutstr(out, "' ) state = INTXT; else if ( ch != '-' ) state = COMMENTIN; } else if ( state==INQUOTE ) { if ( isalpha( ch ) && lenbuf < BUFFERLENGTH-2 ) { buf[ lenbuf ] = ch; lenbuf++; } else if ( ch == ';' && lenbuf>1 && pushoutquot( out, buf+1, lenbuf-1 ) ) { state = INTXT; } else { pushoutstr(out, buf, lenbuf); state = INTXT; ungetc(ch,in); } } else { printf("Unknown state: %d\n", state); exit(1); } } if ( in!=stdin ) fclose(in); if ( out!=stdout ) fclose(out); return 0; }