X-Git-Url: http://www.sigaev.ru/git/gitweb.cgi?p=clrlibru.git;a=blobdiff_plain;f=clrlibru.c;h=76b44f9ae9956eb440d4d86f8916e2144b854a1a;hp=13b25b681d1722a1a6309c198367579b4434dce1;hb=HEAD;hpb=d13f66d9af94b45a66a9f46c7a460b862325d858 diff --git a/clrlibru.c b/clrlibru.c index 13b25b6..76b44f9 100644 --- a/clrlibru.c +++ b/clrlibru.c @@ -1,22 +1,57 @@ +/* + * Copyright (c) 2004 Teodor Sigaev + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #include #include #include #include +static void pushoutstr(FILE *out, char *buf, int len); + static void usage() { printf("Clear text from lib.ru for Pocket PC, Version 0.2.\n"); - printf("Author: Teodor Sigaev \n"); + printf("Copyright (c) 2004 Teodor Sigaev \n"); + printf(" All rights reserved.\n"); printf("Usage:\n clrlibru [-i INPUTFILE] [-o OUTPUT] [-l NUMSPACE]\n"); exit(0); } -char* RemoveTag[]={ +static char* RemoveTag[]={ "select", "head", "div", "a", "form", + "script", + "style", + "title", NULL }; @@ -33,6 +68,43 @@ is_rtag(char *tag, int len) { return 0; } +typedef struct { + char *quote; + char *str; + int len; +} Quote; + +static Quote quoteChange[] = { + {"quot", "\"", -1}, + {"laquo", "\"", -1}, + {"raquo", "\"", -1}, + {"quot", "\"", -1}, + {"lt", "<", -1}, + {"gt", ">", -1}, + {"nbsp", " ", -1}, + {"mdash", "-", -1}, + {"amp", "&", -1}, + {"shy", "", -1}, + {NULL, NULL, -1} +}; + +static int +pushoutquot(FILE *out, char *buf, int buflen) { + Quote *ptr = quoteChange; + + buf[buflen]='\0'; + while( ptr->quote ) { + if ( strcmp( ptr->quote, buf ) == 0 ) { + if ( ptr->len < 0 ) + ptr->len = strlen( ptr->str ); + pushoutstr( out, ptr->str, ptr->len ); + return 1; + } + ptr++; + } + return 0; +} + static char *optarg = NULL; static int current=1; @@ -68,21 +140,85 @@ mgetopt(int argn, char* argv[], char *option) { return (int)key; } -#define INTXT 0 -#define INTAG 1 -#define FINDEND 2 +#define INTXT 0 +#define INTAG 1 +#define FINDEND 2 #define INDROPTAG 3 #define INDROPINTAG 4 #define INHEADTAG 5 #define INDROPINCLSTAG 6 +#define WAITAFTERRED 7 +#define COMMENTBEGIN1 8 +#define COMMENTBEGIN2 9 +#define COMMENTIN 10 +#define COMMENTEND1 11 +#define COMMENTEND2 12 +#define INQUOTE 13 + +#define BUFFERLENGTH 8192 +#define REDSTRING " " + +typedef enum TypeOut { + Char, + NewLine, + Tag, + Paragraph, + None +} TypeOut; + +static void +pushout( FILE *out, TypeOut type, int value ) { + static TypeOut PrevType=None; + static int prevvalue=0; + static int newlinecount=0; + + if ( type == Char ) { + if ( !(( PrevType==Tag || PrevType==None || PrevType==NewLine ) && ( value == ' ' || value == '\t' )) ) { + newlinecount=0; + fputc(value ,out); + } + prevvalue = value; + } else if ( type == NewLine ) { + if ( newlinecount < 2 ) + fputc('\n', out); + newlinecount++; + } else if ( type != PrevType ) { + switch(type) { + case Tag: + if ( !(PrevType==None || PrevType==NewLine || ( PrevType==Char && ( prevvalue == ' ' || prevvalue == '\t' || prevvalue == '\n' ) )) ) { + newlinecount=0; + fputc(' ', out); + } + break; + case Paragraph: + pushout(out, NewLine, 0); + fwrite(REDSTRING, sizeof(char), strlen(REDSTRING), out); + newlinecount=0; + break; + default: + printf("Unknown type: %d", type); + exit(1); + } + } + PrevType = type; +} + +static void +pushoutstr(FILE *out, char *buf, int len) { + char *ptr=buf; + while( ptr-buf= 0\n"); + if ( spacelen < 0 || spacelen >= BUFFERLENGTH ) { + printf("-l should be >= 0 and < %d\n", BUFFERLENGTH); exit(1); } break; @@ -115,6 +251,9 @@ main(int argn, char *argv[]) { } while( (ch=getc(in)) != EOF ) { + if ( ch == '\r' ) + continue; + if ( state==INTXT ) { if ( ch == '<' ) { state=INHEADTAG; @@ -122,23 +261,35 @@ main(int argn, char *argv[]) { } else if ( ch == '\n' ) { state=FINDEND; lenbuf=1; - *buf = ch; - } else if ( ch != '\r' ) - fputc(ch,out); + } else if ( ch=='&' ) { + *buf='&'; + lenbuf=1; + state=INQUOTE; + } else { + pushout(out, Char, ch); + } } else if ( state==INHEADTAG ) { if ( isalpha(ch) ) { - buf[ lenbuf ] = tolower(ch); - lenbuf++; + if ( lenbuf < BUFFERLENGTH-1 ) { + buf[ lenbuf ] = tolower(ch); + lenbuf++; + } + } else if ( ch == '!' ) { + state = COMMENTBEGIN1; } else if ( ch == '>' ) { if ( is_rtag(buf,lenbuf) ) { state = INDROPTAG; closelen=0; } else { state=INTXT; - fputc(' ',out); + if ( lenbuf==0 ) + pushoutstr(out, "<>", 2); + else + pushout(out, Tag, 0); } } else if ( lenbuf == 0 && ch != '/' ) { - fputc('<',out); fputc(ch,out); + pushout(out, Char, '<'); + pushout(out, Char, ch); state=INTXT; } else { if ( is_rtag(buf,lenbuf) ) { @@ -146,14 +297,13 @@ main(int argn, char *argv[]) { closelen=0; } else { state=INTAG; - fputc(' ',out); } } } else if ( state==INTAG ) { if ( ch == '>' ) { state=INTXT; - fputc(' ',out); - } + pushout(out, Tag, 0); + } } else if ( state == INDROPTAG ) { if ( ch == '<' ) { state=INDROPINTAG; @@ -175,28 +325,68 @@ main(int argn, char *argv[]) { } else state=INDROPTAG; } else if ( state==FINDEND ) { - if ( ch == ' ' ) { - buf[ lenbuf ] = ch; + if ( ch == ' ' || ch == '\t' ) { lenbuf++; if ( lenbuf > spacelen ) { - fwrite(buf, sizeof(char), lenbuf, out); - state=INTXT; + pushout( out, Paragraph, 0 ); + state=WAITAFTERRED; } } else if ( ch=='\n' ) { - buf[ lenbuf ] = ch; + pushout( out, NewLine, 0 ); + pushout( out, NewLine, 0 ); lenbuf++; - fwrite(buf, sizeof(char), lenbuf, out); + } else { state=INTXT; - } else if ( ch !='\r' ) { + pushout(out, Char, ' '); + ungetc(ch,in); + } + } else if ( state==WAITAFTERRED ) { + if ( !isspace(ch) ) { + ungetc(ch,in); state=INTXT; - fputc(' ',out); + } + } else if ( state==COMMENTBEGIN1 ) { + if ( ch == '-' ) { + state = COMMENTBEGIN2; + } else { + pushoutstr(out, "' ) + state = INTXT; + else if ( ch != '-' ) + state = COMMENTIN; + } else if ( state==INQUOTE ) { + if ( isalpha( ch ) && lenbuf < BUFFERLENGTH-2 ) { + buf[ lenbuf ] = ch; + lenbuf++; + } else if ( ch == ';' && lenbuf>1 && pushoutquot( out, buf+1, lenbuf-1 ) ) { + state = INTXT; + } else { + pushoutstr(out, buf, lenbuf); + state = INTXT; ungetc(ch,in); - } + } } else { printf("Unknown state: %d\n", state); exit(1); } - } + } if ( in!=stdin ) fclose(in);