/* DP4 version 4.0000 0000000000800000000000 $Log: convhhc.c $ Revision 1.4 2003/10/17 17:40:56Z Alun Added DTD to .htm output Revision 1.3 2003/10/15 13:31:45Z Alun Browse files want backslashes in urls */ #include #include #include #include #define _CTUPPER 1 #define _CTLOWER 2 #define _CTDIGIT 4 #define _CTSPACE 8 #define _CTPUNCT 16 #define _CTCTRL 32 #define _CTHEX 64 #define _CTKANJI 128 struct CDATA { char ctype[UCHAR_MAX+1]; char ucase_xlat_table[UCHAR_MAX+1]; char lcase_xlat_table[UCHAR_MAX+1]; }; #define is_ucase(c) (cdata.ctype[(unsigned char) (c)] & _CTUPPER) #define is_lcase(c) (cdata.ctype[(unsigned char) (c)] & _CTLOWER) #define is_letter(c) (cdata.ctype[(unsigned char) (c)] & _CTUPPER + _CTLOWER) #define is_digit(c) (cdata.ctype[(unsigned char) (c)] & _CTDIGIT) #define is_xdigit(c) (cdata.ctype[(unsigned char) (c)] & _CTDIGIT+_CTHEX) #define is_space(c) (cdata.ctype[(unsigned char) (c)] & _CTSPACE) #define is_punct(c) (cdata.ctype[(unsigned char) (c)] & _CTPUNCT) #define is_alnum(c) (cdata.ctype[(unsigned char) (c)] & _CTUPPER+_CTLOWER+_CTDIGIT) #define is_ctrl(c) (cdata.ctype[(unsigned char) (c)] & _CTCTRL) #define is_graph(c) (cdata.ctype[(unsigned char) (c)] & _CTUPPER+_CTLOWER+_CTDIGIT+_CTPUNCT) #define is_kanji1(c) (cdata.ctype[(unsigned char) (c)] & _CTKANJI) #define to_ucase(c) (cdata.ucase_xlat_table[(unsigned char) c]) #define to_lcase(c) (cdata.lcase_xlat_table[(unsigned char) c]) #pragma warning(disable:4305) static struct CDATA cdata = { { _CTCTRL,_CTCTRL,_CTCTRL,_CTCTRL,_CTCTRL,_CTCTRL,_CTCTRL,_CTCTRL, _CTCTRL,_CTCTRL|_CTSPACE,_CTCTRL|_CTSPACE,_CTCTRL|_CTSPACE,_CTCTRL|_CTSPACE,_CTCTRL|_CTSPACE,_CTCTRL,_CTCTRL, _CTCTRL,_CTCTRL,_CTCTRL,_CTCTRL,_CTCTRL,_CTCTRL,_CTCTRL,_CTCTRL, _CTCTRL,_CTCTRL,_CTCTRL,_CTCTRL,_CTCTRL,_CTCTRL,_CTCTRL,_CTCTRL, _CTSPACE,_CTPUNCT,_CTPUNCT,_CTPUNCT,_CTPUNCT,_CTPUNCT,_CTPUNCT,_CTPUNCT, _CTPUNCT,_CTPUNCT,_CTPUNCT,_CTPUNCT,_CTPUNCT,_CTPUNCT,_CTPUNCT,_CTPUNCT, _CTDIGIT,_CTDIGIT,_CTDIGIT,_CTDIGIT,_CTDIGIT,_CTDIGIT,_CTDIGIT,_CTDIGIT, _CTDIGIT,_CTDIGIT,_CTPUNCT,_CTPUNCT,_CTPUNCT,_CTPUNCT,_CTPUNCT,_CTPUNCT, _CTPUNCT,_CTUPPER|_CTHEX,_CTUPPER|_CTHEX,_CTUPPER|_CTHEX,_CTUPPER|_CTHEX,_CTUPPER|_CTHEX,_CTUPPER|_CTHEX,_CTUPPER, _CTUPPER,_CTUPPER,_CTUPPER,_CTUPPER,_CTUPPER,_CTUPPER,_CTUPPER,_CTUPPER, _CTUPPER,_CTUPPER,_CTUPPER,_CTUPPER,_CTUPPER,_CTUPPER,_CTUPPER,_CTUPPER, _CTUPPER,_CTUPPER,_CTUPPER,_CTPUNCT,_CTPUNCT,_CTPUNCT,_CTPUNCT,_CTPUNCT, _CTPUNCT,_CTLOWER|_CTHEX,_CTLOWER|_CTHEX,_CTLOWER|_CTHEX,_CTLOWER|_CTHEX,_CTLOWER|_CTHEX,_CTLOWER|_CTHEX,_CTLOWER, _CTLOWER,_CTLOWER,_CTLOWER,_CTLOWER,_CTLOWER,_CTLOWER,_CTLOWER,_CTLOWER, _CTLOWER,_CTLOWER,_CTLOWER,_CTLOWER,_CTLOWER,_CTLOWER,_CTLOWER,_CTLOWER, _CTLOWER,_CTLOWER,_CTLOWER,_CTPUNCT,_CTPUNCT,_CTPUNCT,_CTPUNCT,_CTCTRL, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0 }, { 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f, 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27, 0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f, 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f, 0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47, 0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f, 0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57, 0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f, 0x60,'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',0x7b,0x7c,0x7d,0x7e,0x7f, 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f, 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f, 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf, 0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf, 0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7, 0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf, 0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7, 0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf, 0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7, 0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef, 0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7, 0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff }, { 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f, 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27, 0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f, 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f, 0x40,'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',0x5b,0x5c,0x5d,0x5e,0x5f, 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f, 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f, 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f, 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f, 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf, 0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf, 0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7, 0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf, 0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7, 0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf, 0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7, 0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef, 0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7, 0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff } }; /**/ static char * str_lcase(char *string) { register unsigned char * s = (unsigned char *) string; while (*s) { *s = to_lcase(*s); s++; } return string; } #define NR_ELEMENTS(a) (sizeof(a)/sizeof(a[0])) #define DTD3_2 1 /* Tag in HTML3.2 */ #define DEPRECATED 2 /* Tag deprecated in latest DTD supported*/ #define DTD4 4 /* Tag in HTML4.0 */ #define IE3 8 /* Tag in IE3 */ #define IE4 16 /* Tag in IE4 */ #define IE5 32 /* Tag in IE5 */ #define NS4 64 /* Tag in NS4 */ #define IE (IE3|IE4|IE5) #define ALL (DTD3_2|DTD4|IE|NS4) #define TRANSITIONAL (DTD4|DTD3_2|DEPRECATED|IE|NS4) #define STRICT (DTD4|NS4|IE4|IE5) #define OBSOLETE (DTD3_2|IE|NS4|DEPRECATED) /* flags for get_text() function */ #define GW_TAG 1 #define GW_ATTRIBUTE 2 #define GW_VALUE 4 #define GW_REFORMAT 8 #define GW_TEXT 16 #define GW_ESCAPE 32 #define GW_CURRENT 64 #define GW_LT 128 /* These are all the elements IE5 and Netscape 4.x know about. This is a super-set of HTML 4.0 */ struct { const char *name; int flags; } elements[] = { "!doctype",ALL, "a",ALL, "abbr",STRICT & ~IE, "acronym",STRICT, "address",ALL, "applet",TRANSITIONAL, "area",ALL, "b",ALL, "base",ALL, "basefont",TRANSITIONAL, "bdo",STRICT & ~IE4, "bgsound",IE, "big",ALL, "blink",NS4|IE,/*IE recognises this as a tag, but it does not actually blink*/ "blockquote",ALL, "body",ALL, "br",ALL, "button",STRICT&~NS4, "caption",ALL, "center",TRANSITIONAL, "cite",ALL, "code",ALL, "col",ALL, "colgroup", STRICT|IE3, "comment", IE, "dd",ALL, "del",STRICT & ~NS4, "dfn",ALL & ~NS4, "dir",TRANSITIONAL, "div",ALL, "dl",ALL, "dt",ALL, "em",ALL, "embed",ALL, "fieldset",STRICT, "font",TRANSITIONAL, "form",ALL, "frame",STRICT|IE3, "frameset",STRICT|IE3, "head",ALL, "h1",ALL, "h2",ALL, "h3",ALL, "h4",ALL, "h5",ALL, "h6",ALL, "hr",ALL, "html",ALL, "i",ALL, "iframe",STRICT & ~NS4, "ilayer",NS4, "img",ALL, "input",ALL, "ins",STRICT&~NS4, "isindex",TRANSITIONAL, "kbd",ALL, "label",STRICT, "layer",NS4, "legend",STRICT, "li",ALL, "link",ALL, "listing",OBSOLETE, "map",ALL, "marquee",IE, "menu",TRANSITIONAL, "meta",ALL, "nextid",IE, "nobr",IE, "noframes",STRICT|IE3, "nolayer",NS4, "noscript",STRICT, "object",STRICT|IE3, "ol",ALL, "optgroup",STRICT & ~IE & ~NS4, "option",ALL, "p",ALL, "param",STRICT, "plaintext",OBSOLETE, "pre",ALL, "q",STRICT, "rt",IE5, "ruby",IE5, "s",TRANSITIONAL, "samp",ALL, "script",ALL, "select",ALL, "small",ALL, "span",STRICT|IE3, "strike",TRANSITIONAL, "strong",ALL, "style",ALL, "sub",ALL, "sup",ALL, "table",ALL, "tbody",STRICT|IE3, "td",ALL, "textarea",ALL, "tfoot",STRICT|IE3, "th",ALL, "thead",STRICT|IE3, "title",ALL, "tr",ALL, "tt",ALL, "u",TRANSITIONAL, "ul",ALL, "var",ALL, "wbr",IE, "xml",IE5, "xmp",OBSOLETE }; /* Lots of "constants" for elements of interest */ /* This list is the elements that are block elements according to MS with a couple of deviations based on experimental evidence. A Block elements has an implied line break,and ends the preceding paragraph. We aren't going to use this directly because we match tags by comparing the addresses of their names,and this depends on string pooling,which might not be implemented by all compilers */ static const char *b_elements[] = { "address", "applet", "blockquote", "caption", "center", "col", "colgroup", "dd", "dir", "div", "dl", "dt", "embed" "fieldset", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "iframe", "ilayer", "layer", "legend", "li", /* MS say this is inline but it causes a line break */ "marquee", "menu", "noframes", "noscript", "object", "ol", "p", "pre", /* "script", */ /* MS say this is a block element but it is not in IE5 */ "table", "ul", "xml" }; /* These are the block elements for which the end tag is optional */ static const char *o_elements[] = { "dd", "dt", "li", "p" }; /* These are the tags for which a closing tag is illegal. They are therefore all "empty" */ const char *i_elements[] = { "!doctype", "area", "base", "basefont", "br", "col", "frame", "hr", "img", "input", "isindex", "link", "meta", "nextid", "param", "wbr" }; /* These are the lists that use the
  • tag */ const static char * l_elements[] = { "dir", "menu", "ol", "ul" }; /*These are the row group elements that can appear within the tag*/ static const char * r_elements[] = { "thead", "tfoot", "tbody" }; /* There are the elements legal inside the head */ static const char * h_elements[] = { "base", "basefont", "bgsound", "link", "meta", "nextid", "script", "style", "title" }; static const char * strings[] = { "type", "name", "value", "text/sitemap", "Local", "Merge" }; /**/ static const char * block_element[NR_ELEMENTS(b_elements)]; static const char * island_element[NR_ELEMENTS(i_elements)]; static const char * list_element[NR_ELEMENTS(l_elements)]; static const char * rowgroup_element[NR_ELEMENTS(r_elements)]; static const char * heading_element[NR_ELEMENTS(h_elements)]; static const char * opt_end_element[NR_ELEMENTS(o_elements)]; typedef const char * el; static el tr_element,th_element,td_element,table_element,colgroup_element,caption_element; static el p_element,ul_element,li_element,script_element,head_element,xmp_element; static el object_element,param_element; static el comment_element = "!--"; static el type_at,name_at,value_at,merge_value,name_value,local_value,sitemap_value; static el plaintext_element,listing_element,pre_element; #define is_block(tagname) is_one_of(tagname,block_element,NR_ELEMENTS(block_element)) #define is_island(tagname) is_one_of(tagname,island_element,NR_ELEMENTS(island_element)) #define is_list(tagname) is_one_of(tagname,list_element,NR_ELEMENTS(list_element)) #define is_rowgroup(tagname) is_one_of(tagname,rowgroup_element,NR_ELEMENTS(rowgroup_element)) #define is_head(tagname) is_one_of(tagname,heading_element,NR_ELEMENTS(heading_element)) #define is_opt_end(tagname) is_one_of(tagname,opt_end_element,NR_ELEMENTS(opt_end_element)) struct ATTRIBUTE { struct ATTRIBUTE * next; const char * name; const char * value; char quoted; char name_relocated; char value_relocated; }; struct tagDOCNODE; typedef struct tagDOCNODE DOCNODE; struct tagDOCNODE { DOCNODE *prev_sibling; DOCNODE *next_sibling; DOCNODE *first_child; DOCNODE *last_child; DOCNODE *parent; struct ATTRIBUTE *first_attribute; const char *name; const char *text; char closing; char closed; char name_relocated; char text_relocated; }; #define TRUE '\1' #define FALSE '\0' typedef int BOOLEAN; static FILE * filenr; static FILE * out_filenr; static DOCNODE * first_tag; static char * line_start; static int current; static int depth = 0; static int started; static int create_js = TRUE; static int create_brs = FALSE; static int create_html = TRUE; static int create_hhm = TRUE; static int merge_chm = TRUE; #define SE_DEFAULT 0 #define SE_RELOCATE 1 #define SE_LCASE 2 int main(int argc,char **argv); static void setup_elements(void); static char setup_element(const char **tag_const, char *tag_name, int flags); static DOCNODE *parse_file(const char *filename,int do_output); static void skip_space(void); static int next_char(void); static DOCNODE *alloc_node(DOCNODE *parent); static DOCNODE *parse_children(DOCNODE *tag); static void free_node(DOCNODE * tag); static void parse_comment(DOCNODE *tag, int xmp); static void parse_attributes(DOCNODE *tag); static char *get_text(int flags); static int permitted(DOCNODE *tag); static BOOLEAN implied_close(const char *old_name, const char *new_name, int closing); static BOOLEAN cross_tag(DOCNODE *tag, const char *new_name); static void report(int errornr,const char * tagname); static int tag_flags(const char *tag_name); static void modify_tree(DOCNODE * tag); static DOCNODE * find_tag(DOCNODE * node, const char * tag_name); static void node_remove(DOCNODE * node); static void node_replace(DOCNODE * old_node,DOCNODE * new_node); static void node_insert_after(DOCNODE * old_node,DOCNODE * new_node); static void node_merge(DOCNODE * tag); static char *fix_filename(const char * filename,const char *suffix); static void output_html(DOCNODE *tag); static void output_js(DOCNODE *tag); static char * fix_text(const char * text); static char * fix_href(char * href); static char * unfix_href(char * href); static void output_hhm(DOCNODE *tag); static void output_brs(DOCNODE *tag); static BOOLEAN is_one_of(const char * tagname,char ** list,int list_size) { int i; for (i = 0; i < list_size;i++) if (tagname == list[i]) return TRUE; return FALSE; } /**/ int main(int argc,char **argv) { int i = 1; setup_elements(); for (i = 1;i < argc;i++) if (stricmp(argv[i],"-nojs")==0) create_js = FALSE; else if (stricmp(argv[i],"-nohtml")==0) create_html = FALSE; else if (stricmp(argv[i],"-nohhm")==0) create_hhm = FALSE; else if (stricmp(argv[i],"-brs")==0) create_brs = TRUE; else if (stricmp(argv[i],"-nochmcontents")==0) merge_chm = FALSE; else parse_file(argv[i],TRUE); return 0; } /**/ static void setup_elements() { int i; setup_element(&tr_element,"tr",SE_DEFAULT); setup_element(&td_element,"td",SE_DEFAULT); setup_element(&th_element,"th",SE_DEFAULT); setup_element(&p_element,"p",SE_DEFAULT); setup_element(&li_element,"li",SE_DEFAULT); setup_element(&caption_element,"caption",SE_DEFAULT); setup_element(&colgroup_element,"colgroup",SE_DEFAULT); setup_element(&script_element,"script",SE_DEFAULT); setup_element(&table_element,"table",SE_DEFAULT); setup_element(&head_element,"head",SE_DEFAULT); setup_element(&xmp_element,"xmp",SE_DEFAULT); setup_element(&ul_element,"ul",SE_DEFAULT); setup_element(¶m_element,"param",SE_DEFAULT); setup_element(&object_element,"object",SE_DEFAULT); setup_element(&listing_element,"listing",SE_DEFAULT); setup_element(&plaintext_element,"plaintext",SE_DEFAULT); setup_element(&pre_element,"pre",SE_DEFAULT); setup_element(&type_at,"type",SE_DEFAULT); setup_element(&name_at,"name",SE_DEFAULT); setup_element(&value_at,"value",SE_DEFAULT); setup_element(&name_value,"Name",SE_DEFAULT); setup_element(&local_value,"Local",SE_DEFAULT); setup_element(&merge_value,"Merge",SE_DEFAULT); setup_element(&sitemap_value,"text/sitemap",SE_DEFAULT); for (i = 0;i < NR_ELEMENTS(b_elements);i++) setup_element(&block_element[i],(char *) b_elements[i],SE_DEFAULT); for (i = 0;i < NR_ELEMENTS(i_elements);i++) setup_element(&island_element[i],(char *) i_elements[i],SE_DEFAULT); for (i = 0;i < NR_ELEMENTS(l_elements);i++) setup_element(&list_element[i],(char *) l_elements[i],SE_DEFAULT); for (i = 0;i < NR_ELEMENTS(h_elements);i++) setup_element(&heading_element[i],(char *) h_elements[i],SE_DEFAULT); for (i = 0;i < NR_ELEMENTS(r_elements);i++) setup_element(&rowgroup_element[i],(char *) r_elements[i],SE_DEFAULT); for (i = 0;i < NR_ELEMENTS(o_elements);i++) setup_element(&opt_end_element[i],(char *) o_elements[i],SE_DEFAULT); } /**/ static char setup_element(const char ** tag_const,char * tag_name,int flags) { int i; for (i =0; i < NR_ELEMENTS(elements);i++) if (stricmp(tag_name,elements[i].name)==0) { *tag_const = elements[i].name; if (flags & SE_RELOCATE) free(tag_name); return TRUE; } for (i =0; i < NR_ELEMENTS(strings);i++) if (stricmp(tag_name,strings[i])==0) { *tag_const = strings[i]; if (flags & SE_RELOCATE) free(tag_name); return TRUE; } *tag_const = tag_name; if (flags & SE_LCASE) str_lcase(tag_name); return FALSE; } DOCNODE * parse_file(const char *filename,int do_output) { DOCNODE *first_tag; const char * s = filename+strlen(filename)-2; while (s > filename && memcmp(s,"::",2)!=0) s--; if (s > filename) { if (!merge_chm) return 0; filename = s+2; } filenr = fopen(filename,"rb"); if (!filenr) { printf("Can't open %s\n",filename); return 0; } printf("Reading %s\n",filename); first_tag = alloc_node(0); next_char(); skip_space(); parse_children(first_tag); fclose(filenr); modify_tree(first_tag); if (do_output) { if (create_js) { started = FALSE; out_filenr = fopen(fix_filename(filename,".js"),"w"); output_js(first_tag); fclose(out_filenr); } if (create_html) { started = FALSE; out_filenr = fopen(fix_filename(filename,".htm"),"w"); output_html(first_tag); fclose(out_filenr); } if (create_hhm) { started = FALSE; out_filenr = fopen(fix_filename(filename,".hhm"),"w"); output_hhm(first_tag); fclose(out_filenr); } if (create_brs) { started = FALSE; out_filenr = fopen(fix_filename(filename,".browse"),"w"); output_brs(first_tag); fclose(out_filenr); } } return first_tag; } /**/ static char *fix_filename(const char * filename,const char *suffix) { #define MAX_PATH 256 static char new_filename[MAX_PATH]; const char *s = filename + strlen(filename); while (s >= filename && *s != '.') s--; if (s < filename) s = filename+strlen(filename); memcpy(new_filename,filename,s-filename); strcpy(new_filename + (s - filename),suffix); return new_filename; } /**/ static void skip_space() { while (current != EOF && is_space(current)) next_char(); } /**/ static int linenr = 0; static int column = 0; static char buffer[4096]; static int buf_index = 0; static int buf_size = 0; static int next_char() { do { if (buf_index == buf_size) { buf_size = fread(buffer,1,sizeof(buffer),filenr); buf_index = 0; line_start = buffer; } if (buf_index == buf_size) return current = EOF; current = buffer[buf_index++]; column++; } while (current == '\r'); if (current == '\n') { column = 0; line_start = buffer+buf_index; linenr++; } return current; } /**/ static DOCNODE * alloc_node(DOCNODE * parent) { DOCNODE * tag = malloc(sizeof(DOCNODE)); memset(tag,0,sizeof(DOCNODE)); tag->parent = parent; return tag; } /**/ static DOCNODE * parse_children(DOCNODE * tag) { DOCNODE ** child = &tag->first_child; DOCNODE *prev = 0; DOCNODE *node; char * word; while (current != EOF) { if (current == '<') { next_char(); if (current == '/') { node = alloc_node(tag); node->closing = TRUE; word = get_text(GW_TAG); node->name_relocated = setup_element(&node->name,word,SE_LCASE|SE_RELOCATE); parse_attributes(node); if (tag->name == node->name || tag->name && node->name && strcmp(tag->name,node->name)==0) { tag->closed = 1; free_node(node); return 0; } if (implied_close(tag->name,node->name,TRUE)) return node; /* Crossed tags! - Make a reasonable guess as to whether to ignore closing tag or imply closing tags*/ if (cross_tag(tag,node->name)) return node; } else if (is_letter(current) || current == '!') { word = get_text(GW_TAG|GW_CURRENT); node = alloc_node(tag); setup_element(&node->name,word,SE_RELOCATE|SE_LCASE); if (strlen(node->name) >= 3 && memcmp(node->name,"!--",3)==0) { parse_comment(node,FALSE); tag->last_child = *child = node; node->closed = (char) (current != EOF); node->prev_sibling = prev; prev = node; child = &node->next_sibling; } else { int errornr; parse_attributes(node); while (node) { node->parent = tag; if (node->closing && (tag->name == node->name || tag->name && node->name && strcmp(tag->name,node->name)==0)) { tag->closed = 1; free_node(node); return 0; } if (implied_close(tag->name,node->name,node->closing)) return node; errornr = permitted(node); if (errornr) { printf("%s %s %d %d %d\n",tag->name,node->name,tag->name,node->name,li_element); report(errornr,node->name); } if (node->closing) { /* Crossed tags! - Make a reasonable guess as to whether to ignore closing tag or imply closing tags*/ if (cross_tag(tag,node->name)) return node; } if (!node->closing) { tag->last_child = *child = node; node->prev_sibling = prev; prev = node; child = &node->next_sibling; if (is_island(node->name)) node = 0; else if (node->name != xmp_element) node = parse_children(node); else { parse_comment(node,TRUE); node->closed = (char) (current != EOF); /* Skip past closing tag */ node = alloc_node(tag); parse_attributes(node); free_node(node); node = 0; } } } } } else { /*treat as text */ tag->last_child = *child = node = alloc_node(tag); node->prev_sibling = prev; prev = node; child = &node->next_sibling; node->text = get_text(tag_flags(tag->name)|GW_CURRENT|GW_LT); } } else { tag->last_child = *child = node = alloc_node(tag); node->prev_sibling = prev; prev = node; child = &node->next_sibling; node->text = get_text(tag_flags(tag->name)|GW_CURRENT); } } } /**/ static void free_node(DOCNODE * node) { struct ATTRIBUTE * attribute; DOCNODE * child, * next_node; for (attribute = node->first_attribute;attribute;attribute = node->first_attribute) { node->first_attribute = attribute->next; if (!attribute->name_relocated) free((void *) attribute->name); if (!attribute->value_relocated && attribute->value) free((void *) attribute->value); free(attribute); } if (node->name && !node->name_relocated) free((void *) node->name); if (node->text && !node->text_relocated) free((void *) node->text); for (child = node->first_child;child;child = next_node) { next_node = child->next_sibling; free_node(child); } free(node); } /**/ static void parse_comment(DOCNODE * tag,int xmp) { size_t text_size = strlen(tag->name)-3; size_t buf_size = max((text_size+4095) & ~4095,4096); char * text = malloc(buf_size); if (!xmp) { memcpy(text,tag->name+3,text_size); free((void *) tag->name); tag->name = comment_element; tag->name_relocated = TRUE; } while (current != EOF) { if (text_size == buf_size) { if (buf_size) { buf_size += 4096; text = realloc(text,buf_size); } } text[text_size++] = (char) current; if (xmp ? text_size >= 6 && memicmp(text+text_size-6,"' || is_space(current)): current == '>' && text_size >= 3 && memcmp(text+text_size-3,"-->",3)==0) { text_size -= xmp ? 6 : 3; break; } next_char(); } if (text) { text = realloc(text,text_size+1); text[text_size] = 0; } if (!xmp) next_char(); tag->text = text; } /**/ static void parse_attributes(DOCNODE * tag) { int c = current; char * word; struct ATTRIBUTE ** pattr = &tag->first_attribute; struct ATTRIBUTE * attribute; do { skip_space(); if (current != '>' && current != EOF) { word = get_text(GW_CURRENT+GW_ATTRIBUTE); *pattr = attribute = malloc(sizeof(struct ATTRIBUTE)); attribute->name_relocated = setup_element(&attribute->name,word,SE_LCASE|SE_RELOCATE); attribute->value = 0; attribute->next = 0; pattr = &attribute->next; skip_space(); if (current == '=') { word = get_text(GW_VALUE); if (word[0] == '"' || word[0] == '\'') { attribute->quoted = word[0]; strcpy(word,word+1); word[strlen(word)-1] = '\0'; } attribute->value_relocated = setup_element(&attribute->value,word,SE_RELOCATE); } } } while (current != '>' && current != EOF); next_char(); /* Skip past closing > */ } /**/ static char * get_text(int flags) { size_t buf_size = 0; size_t text_size = 0; char * text = 0; int c = flags & GW_CURRENT ? current : next_char(); int in_quotes = 0; int in_escape = 0; int quote = 0; if (flags & GW_LT) { text = malloc(buf_size = 4096); text[text_size++] = '<'; } if (flags & GW_VALUE+GW_ATTRIBUTE) { skip_space(); if (current == '\'' || current == '"') { quote = current; in_quotes = TRUE; text = malloc(buf_size = 4096); text[text_size++] = (char) quote; c = next_char(); } } while (c != EOF) { if (text_size == buf_size) { if (buf_size) { buf_size += 4096; text = realloc(text,buf_size); } else text = malloc(buf_size = 4096); } if (is_space(c) && flags & GW_REFORMAT) { in_escape = 0; if (flags & GW_REFORMAT) { text[text_size++] = ' '; do c = next_char(); while (c != EOF && is_space(c)); } } else if (is_space(c) && flags & GW_TAG+GW_ATTRIBUTE+GW_VALUE || flags & GW_TAG+GW_ATTRIBUTE && c=='>' || flags & GW_ATTRIBUTE && c=='=' || flags & GW_TEXT && c=='<') { in_escape = 0; if (!in_quotes) break; text[text_size++] = (char) c; c = next_char(); } else { text[text_size++] = (char) c; if (in_escape) in_escape = FALSE; else if (flags & GW_ESCAPE && c == '\\') in_escape = TRUE; else if (in_quotes && c == quote) { in_quotes = FALSE; next_char(); break; } c = next_char(); } } if (text_size) { text = realloc(text,text_size+1); text[text_size] = '\0'; return text; } return 0; } /**/ char *errors[]= { "", "unexpected element <%s>: \n", "found <%s>: only \n", "unexpected element <%s>: ,,, elements are only permitted in
    and elements are only permitted in
    and elements are permitted in
    ,
    \n", "
  • is only permitted in ,,
      ,
        \n", "element <%s> is only permitted in \n", "element <%s> is not permitted in \n", "unexpected closing tag \n" }; /**/ static int permitted(DOCNODE * tag) { if ((tag->name == td_element || tag->name == th_element) != (tag->parent->name == tr_element)) return tag->parent->name == tr_element ? 2 : 1; if ((is_rowgroup(tag->name) || tag->name==colgroup_element || tag->name == caption_element) && tag->parent->name != table_element) return 3; if (tag->name == li_element && !is_list(tag->parent->name)) { printf(tag->parent->name); return 4; } if (tag->name != script_element && is_head(tag->name) != (tag->parent->name==head_element)) return tag->parent->name == head_element ? 6 : 5; return 0; } /**/ static BOOLEAN implied_close(const char * old_name,const char * new_name,int closing) { if (old_name == td_element || old_name == th_element) return new_name == td_element || new_name == th_element || new_name == tr_element || is_rowgroup(new_name) || new_name == table_element && closing; if (old_name == tr_element) return new_name == tr_element || is_rowgroup(new_name) || new_name == table_element && closing; if (old_name == li_element) return new_name == li_element || is_list(new_name) && closing; if (old_name == p_element) return is_block(new_name); if (is_rowgroup(old_name)) return is_rowgroup(new_name); } /**/ static BOOLEAN cross_tag(DOCNODE * tag,const char * new_name) { report(7,new_name); for (tag = tag->parent;tag;tag = tag->parent) { if (tag->name == new_name || tag->name && new_name && strcmp(tag->name,new_name)==0 || implied_close(tag->name,new_name,TRUE)) return TRUE; if (is_block(tag->name) && !is_opt_end(tag->name)) return FALSE; } return 0; } /**/ static void report(int errornr,const char * tagname) { char buffer[81]; const char *s = line_start; char *t; printf(errors[errornr],tagname); for (t = buffer;*s && t < buffer+79;s++,t++) *t = *s; *t = '\0'; printf("%s\n",buffer); } /**/ static int tag_flags(const char * tag_name) { if (tag_name) { if (tag_name == plaintext_element) return 0; /* nothing from now on gets interpreted! */ if (tag_name == listing_element || tag_name == pre_element) return GW_TEXT; } return GW_TEXT|GW_REFORMAT; } /**/ static void modify_tree(DOCNODE * tag) { struct ATTRIBUTE * attribute; DOCNODE * child, * next_node; int is_sitemap = FALSE; if (!tag->name && tag->text && strcmp(tag->text," ")==0) { node_remove(tag); free_node(tag); return; } if (tag->name == object_element) { for (attribute = tag->first_attribute;attribute;attribute = attribute->next) if (attribute->name == type_at && attribute->value == sitemap_value) is_sitemap = TRUE; if (is_sitemap) { for (child = tag->first_child;child;child = next_node) { next_node = child->next_sibling; if (child->name == param_element) { const char * name = 0; const char * value = 0; for (attribute = child->first_attribute;attribute;attribute = attribute->next) { if (attribute->name == name_at) name = attribute->value; if (attribute->name == value_at) value = attribute->value; } if (name == merge_value) { DOCNODE * child_doc = parse_file(value,FALSE); if (child_doc) { DOCNODE * outer_doc; int first_node = TRUE; do { outer_doc = find_tag(child_doc,ul_element); if (outer_doc) { node_remove(outer_doc); if (first_node) { node_replace(tag,outer_doc); first_node = FALSE; } else node_insert_after(tag,outer_doc); tag = outer_doc; if (tag->prev_sibling && tag->prev_sibling->name == ul_element) { outer_doc = tag->prev_sibling; node_merge(tag); tag = outer_doc; } } } while (outer_doc); return; } else if (!tag->text) { tag->text = "\n"; tag->text_relocated = TRUE; } } } } } } for (child = tag->first_child;child;child = next_node) { next_node = child->next_sibling; modify_tree(child); } if (tag->name == ul_element) if (tag->prev_sibling && tag->prev_sibling->name == ul_element) node_merge(tag); } /**/ static DOCNODE * find_tag(DOCNODE * node, const char * tag_name) { DOCNODE * child; if (node->name == tag_name) return node; for (child = node->first_child;child;child = child->next_sibling) { node = find_tag(child,tag_name); if (node) return node; } return 0; } /**/ static void node_remove(DOCNODE * node) { if (node->prev_sibling) node->prev_sibling->next_sibling = node->next_sibling; else node->parent->first_child = node->next_sibling; if (node->next_sibling) node->next_sibling->prev_sibling = node->prev_sibling; else node->parent->last_child = node->prev_sibling; } /**/ static void node_replace(DOCNODE * old_node,DOCNODE * new_node) { new_node->parent = old_node->parent; new_node->prev_sibling = old_node->prev_sibling; new_node->next_sibling = old_node->next_sibling; if (new_node->prev_sibling) new_node->prev_sibling->next_sibling = new_node; else new_node->parent->first_child = new_node; if (new_node->next_sibling) new_node->next_sibling->prev_sibling = new_node; else new_node->parent->last_child = new_node; } /**/ static void node_insert_after(DOCNODE * old_node,DOCNODE * new_node) { new_node->parent = old_node->parent; new_node->prev_sibling = old_node; new_node->next_sibling = old_node->next_sibling; new_node->prev_sibling->next_sibling = new_node; if (new_node->next_sibling) new_node->next_sibling->prev_sibling = new_node; else new_node->parent->last_child = new_node; } /**/ static void node_merge(DOCNODE * tag) { DOCNODE * child; if (tag->prev_sibling->last_child) tag->prev_sibling->last_child->next_sibling = tag->first_child; else tag->prev_sibling->first_child = tag->first_child; if (tag->last_child) { tag->first_child->prev_sibling = tag->prev_sibling->last_child; tag->prev_sibling->last_child = tag->last_child; for (child = tag->first_child;child;child = child->next_sibling) child->parent = tag->prev_sibling; if (tag->first_child->name == ul_element && tag->first_child->prev_sibling && tag->first_child->prev_sibling->name == ul_element) node_merge(tag->first_child); tag->first_child = tag->last_child = 0; } node_remove(tag); free_node(tag); } /**/ static void output_hhm(DOCNODE * tag) { struct ATTRIBUTE * attribute; DOCNODE * child; if (tag->name) { fprintf(out_filenr,"<%s",tag->name); for (attribute = tag->first_attribute;attribute;attribute = attribute->next) { fprintf(out_filenr," %s",attribute->name); if (attribute->value) if (attribute->quoted) fprintf(out_filenr,"=%c%s%c",attribute->quoted,attribute->value,attribute->quoted); else fprintf(out_filenr,"=%s",attribute->value); } if (tag->name != comment_element) fprintf(out_filenr,">"); } if (tag->text) fprintf(out_filenr,tag->text); for (child = tag->first_child;child;child = child->next_sibling) output_hhm(child); if (tag->name && tag->closed) if (tag->name == object_element) fprintf(out_filenr,"\n\n"); else if (tag->name != comment_element) fprintf(out_filenr,"\n",tag->name); else fprintf(out_filenr,"-->\n"); } /**/ static void output_js(DOCNODE * tag) { struct ATTRIBUTE * attribute; DOCNODE * child, * next_node; int is_sitemap = FALSE; int was_started = started; if (tag->name == ul_element && started) { if (depth) fprintf(out_filenr,"ms[d++] = m;\nm = new M();\n"); else fprintf(out_filenr,"m = new M();\n"); depth++; } if (tag->name == object_element) { for (attribute = tag->first_attribute;attribute;attribute = attribute->next) if (attribute->name == type_at && attribute->value == sitemap_value) is_sitemap = TRUE; if (is_sitemap) { char * href = 0; const char * text = 0; for (child = tag->first_child;child;child = next_node) { next_node = child->next_sibling; if (child->name == param_element) { const char * name = 0; const char * value = 0; for (attribute = child->first_attribute;attribute;attribute = attribute->next) { if (attribute->name == name_at) name = attribute->value; if (attribute->name == value_at) value = attribute->value; } if (name == name_value) text = value; if (name == local_value) href = (char *) value; } } if (started) { if (text && href) fprintf(out_filenr,"m.A(\"%s\",\"%s\");\n",fix_text(text),fix_href(href)); else if (text) fprintf(out_filenr,"m.A(\"%s\");\n",fix_text(text)); } else { fprintf(out_filenr,"var ms = new Array;\n" "var d = 0;\n" "var m = null;\n" "var menu = null;\n" "MText = \"%s\"\n",text); started = TRUE; } } } for (child = tag->first_child;child;child = child->next_sibling) output_js(child); if (tag->name == ul_element && was_started) { if (--depth) fprintf(out_filenr,"ms[--d].ML(m);\n" "m = ms[d];\n"); else fprintf(out_filenr,"menu=m;\n"); } } /**/ static char * fix_text(const char * text) { static char *buffer = 0; static size_t buffer_size = 0; const char *s = text; char *t; if (buffer_size < 2*strlen(text)+1) { if (buffer) free(buffer); buffer = malloc(2*strlen(text)+1); } for (t = buffer;*s;t++,s++) { if (*s == '\\' || *s == '"') *t++ = '\\'; *t = *s; } *t = 0; return buffer; } /**/ static char * fix_href(char * href) { char * t; for (t = href;*t;t++) { if (*t == '\\') *t = '/'; } return href; } /**/ static char * unfix_href(char * href) { char * t; for (t = href;*t;t++) { if (*t == '/') *t = '\\'; } return href; } /**/ static void output_html(DOCNODE * node) { struct ATTRIBUTE * attribute; DOCNODE * child, * next_node; int is_sitemap = FALSE; int was_started = started; if (node->name == ul_element && started) fprintf(out_filenr,"
          \n"); if (node->name == object_element) { for (attribute = node->first_attribute;attribute;attribute = attribute->next) if (attribute->name == type_at && attribute->value == sitemap_value) is_sitemap = TRUE; if (is_sitemap) { char * href = 0; const char * text = 0; for (child = node->first_child;child;child = next_node) { next_node = child->next_sibling; if (child->name == param_element) { const char * name = 0; const char * value = 0; for (attribute = child->first_attribute;attribute;attribute = attribute->next) { if (attribute->name == name_at) name = attribute->value; if (attribute->name == value_at) value = attribute->value; } if (name == name_value) text = value; if (name == local_value) href = (char *) value; } } if (!started) { fprintf(out_filenr, "\n" "\n \n%s\n" "" "\n
            \n",text,text); started = TRUE; } if (text && href) fprintf(out_filenr,"
          • %s\n",fix_href(href),text); else if (text) fprintf(out_filenr,"
          • %s\n",text); } } for (child = node->first_child;child;child = child->next_sibling) output_html(child); if (node->name == ul_element) if (was_started) fprintf(out_filenr,"
          \n"); else { fprintf(out_filenr,"
        \n\n\n"); started = was_started = FALSE; } } /**/ static void output_brs(DOCNODE * node) { struct ATTRIBUTE * attribute; DOCNODE * child, * next_node; int is_sitemap = FALSE; int was_started = started; if (node->name == object_element) { for (attribute = node->first_attribute;attribute;attribute = attribute->next) if (attribute->name == type_at && attribute->value == sitemap_value) is_sitemap = TRUE; if (is_sitemap) { char * href = 0; const char * text = 0; for (child = node->first_child;child;child = next_node) { next_node = child->next_sibling; if (child->name == param_element) { const char * name = 0; const char * value = 0; for (attribute = child->first_attribute;attribute;attribute = attribute->next) { if (attribute->name == name_at) name = attribute->value; if (attribute->name == value_at) value = attribute->value; } if (name == name_value) text = value; if (name == local_value) href = (char *) value; } } if (!started) { fprintf(out_filenr,"[Book Like]\n"); started = TRUE; } if (text && href) fprintf(out_filenr,"%s|%s\n",text,unfix_href(href)); } } for (child = node->first_child;child;child = child->next_sibling) output_brs(child); }