From 8f8a9dd7f1fef20cdc2536c79bf2b349aeecddd8 Mon Sep 17 00:00:00 2001
From: Daniel Veillard <veillard@src.gnome.org>
Date: Tue, 25 Jan 2005 21:41:42 +0000
Subject: [PATCH] found and fixed 2 problems in the internal subset scanning
 code affecting

* parser.c: found and fixed 2 problems in the internal subset scanning
  code affecting the push parser (and the reader), fixes #165126
* test/intsubset2.xml result//intsubset2.xml*: added the test case
  to the regression tests.
Daniel
---
 ChangeLog                   |   7 +
 parser.c                    |  34 ++++-
 result/intsubset2.xml       | 250 +++++++++++++++++++++++++++++++
 result/intsubset2.xml.rde   |   5 +
 result/intsubset2.xml.rdr   |   5 +
 result/intsubset2.xml.sax   | 286 ++++++++++++++++++++++++++++++++++++
 result/noent/intsubset2.xml | 250 +++++++++++++++++++++++++++++++
 test/intsubset2.xml         | 282 +++++++++++++++++++++++++++++++++++
 8 files changed, 1113 insertions(+), 6 deletions(-)
 create mode 100644 result/intsubset2.xml
 create mode 100644 result/intsubset2.xml.rde
 create mode 100644 result/intsubset2.xml.rdr
 create mode 100644 result/intsubset2.xml.sax
 create mode 100644 result/noent/intsubset2.xml
 create mode 100644 test/intsubset2.xml
diff --git a/ChangeLog b/ChangeLog
index 94454835..c66d0646 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+Tue Jan 25 22:39:33 CET 2005 Daniel Veillard <daniel@veillard.com>
+
+	* parser.c: found and fixed 2 problems in the internal subset scanning
+	  code affecting the push parser (and the reader), fixes #165126
+	* test/intsubset2.xml result//intsubset2.xml*: added the test case
+	  to the regression tests.
+
 Tue Jan 25 01:20:11 CET 2005 Daniel Veillard <daniel@veillard.com>
 
 	* testdso.c xmlregexp.c: warning patches from Peter Breitenlohner
diff --git a/parser.c b/parser.c
index b9047948..9aa9698c 100644
--- a/parser.c
+++ b/parser.c
@@ -9861,8 +9861,12 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
 				    break;
 				}
 		            }
-			    if (!found)
-			        break;
+			    if (!found) {
+#if 0
+			        fprintf(stderr, "unfinished comment\n");
+#endif
+			        break; /* for */
+		            }
 		            continue;
 			}
 		    }
@@ -9875,6 +9879,10 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
 			continue;
 		    }
 		    if (buf[base] == ']') {
+#if 0
+		        fprintf(stderr, "%c%c%c%c: ", buf[base],
+			        buf[base + 1], buf[base + 2], buf[base + 3]);
+#endif
 		        if ((unsigned int) base +1 >=
 		            ctxt->input->buf->buffer->use)
 			    break;
@@ -9883,20 +9891,34 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
 			    base++;
 			    continue;
 			}
-		        for (i = 0;
+		        for (i = 1;
 		     (unsigned int) base + i < ctxt->input->buf->buffer->use;
 		             i++) {
-			    if (buf[base + i] == '>')
+			    if (buf[base + i] == '>') {
+#if 0
+			        fprintf(stderr, "found\n");
+#endif
 			        goto found_end_int_subset;
+			    }
+			    if (!IS_BLANK_CH(buf[base + i])) {
+#if 0
+			        fprintf(stderr, "not found\n");
+#endif
+			        goto not_end_of_int_subset;
+			    }
 			}
+#if 0
+			fprintf(stderr, "end of stream\n");
+#endif
 		        break;
+                        
 		    }
+not_end_of_int_subset:
+                    continue; /* for */
 		}
 		/*
 		 * We didn't found the end of the Internal subset
 		 */
-		if (quote == 0) 
-		    ctxt->checkIndex = base;
 #ifdef DEBUG_PUSH
 		if (next == 0)
 		    xmlGenericError(xmlGenericErrorContext,
diff --git a/result/intsubset2.xml b/result/intsubset2.xml
new file mode 100644
index 00000000..b1039553
--- /dev/null
+++ b/result/intsubset2.xml
@@ -0,0 +1,250 @@
+<?xml version="1.0"?>
+<!DOCTYPE kanjidic2 [
+<!-- Version 1.3
+	This is the DTD of the XML-format kanji file combining information from
+	the KANJIDIC and KANJD212 files. It is intended to be largely self-
+	documenting, with each field being accompanied by an explanatory
+	comment.
+
+	The file covers the following kanji:
+	(a) the 6,355 kanji from JIS X 0208;
+	(b) the 5,801 kanji from JIS X 0212;
+	(c) the 3,625 kanji from JIS X 0213 as follows:
+		(i) the 2,741 kanji which are also in JIS X 0212 have
+		JIS X 0213 code-points (kuten) added to the existing entry;
+		(ii) the 884 "new" kanji have new entries.
+
+	At the end of the explanation for a number of fields there is a tag
+	with the format [N]. This indicates the leading letter(s) of the
+	equivalent field in the KANJIDIC and KANJD212 files.
+
+	The KANJIDIC documentation should also be read for additional 
+	information about the information in the file.
+	--><!ELEMENT kanjidic2 (header , character*)>
+<!ELEMENT header (file_version , database_version , date_of_creation)>
+<!--
+	The single header element will contain identification information
+	about the version of the file 
+	--><!ELEMENT file_version (#PCDATA)>
+<!--
+	This field denotes the version of kanjidic2 structure, as more
+	than one version may exist.
+	--><!ELEMENT database_version (#PCDATA)>
+<!--
+	The version of the file, in the format YYYY-NN, where NN will be
+	a number starting with 01 for the first version released in a
+	calendar year, then increasing for each version in that year.
+	--><!ELEMENT date_of_creation (#PCDATA)>
+<!--
+	The date the file was created in international format (YYYY-MM-DD).
+	--><!ELEMENT character (literal , codepoint , radical , misc , dic_number? , query_code? , reading_meaning? , nanori?)*>
+<!ELEMENT literal (#PCDATA)>
+<!--
+	The character itself in UTF8 coding.
+	--><!ELEMENT codepoint (cp_value)+>
+<!-- 
+	The codepoint element states the code of the character in the various
+	character set standards.
+	--><!ELEMENT cp_value (#PCDATA)>
+<!-- 
+	The cp_value contains the codepoint of the character in a particular
+	standard. The standard will be identified in the cp_type attribute.
+	--><!ATTLIST cp_value cp_type CDATA #REQUIRED>
+<!-- 
+	The cp_type attribute states the coding standard applying to the
+	element. The values assigned so far are:
+		jis208 - JIS X 0208-1997 - kuten coding (nn-nn)
+		jis212 - JIS X 0212-1990 - kuten coding (nn-nn)
+		jis213 - JIS X 0213-2000 - kuten coding (p-nn-nn)
+		ucs - Unicode 4.0 - hex coding (4 or 5 hexadecimal digits)
+	--><!ELEMENT radical (rad_value)+>
+<!ELEMENT rad_value (#PCDATA)>
+<!-- 
+	The radical number, in the range 1 to 214. The particular
+	classification type is stated in the rad_type attribute.
+	--><!ATTLIST rad_value rad_type CDATA #REQUIRED>
+<!-- 
+	The rad_type attribute states the type of radical classification.
+		classical - as recorded in the KangXi Zidian.
+		nelson - as used in the Nelson "Modern Japanese-English 
+		Character Dictionary" (i.e. the Classic, not the New Nelson).
+		This will only be used where Nelson reclassified the kanji.
+	--><!ELEMENT misc (grade? , stroke_count+ , variant* , freq* , rad_name*)>
+<!ELEMENT grade (#PCDATA)>
+<!-- 
+	The Jouyou Kanji grade level. 1 through 6 indicate the grade in which
+	the kanji is taught in Japanese schools. 8 indicates it is one of the
+	remaining Jouyou Kanji to be learned in junior high school, and 9 
+	indicates it is a Jinmeiyou (for use in names) kanji. [G]
+	--><!ELEMENT stroke_count (#PCDATA)>
+<!-- 
+	The stroke count of the kanji, including the radical. If more than 
+	one, the first is considered the accepted count, while subsequent ones 
+	are common miscounts. (See Appendix E. of the KANJIDIC documentation
+	for some of the rules applied when counting strokes in some of the 
+	radicals.) [S]
+	--><!ELEMENT variant (#PCDATA)>
+<!-- 
+	A cross-reference code to another kanji, usually regarded as a variant.
+	The type of cross-reference is given in the var_type attribute.
+	--><!ATTLIST variant var_type CDATA #REQUIRED>
+<!-- 
+	The var_type attribute indicates the type of variant code. The current
+	values are: 
+		jis208 - in JIS X 0208 - kuten coding
+		jis212 - in JIS X 0212 - kuten coding
+		jis213 - in JIS X 0213 - kuten coding
+		deroo - De Roo number - numeric
+		njecd - Halpern NJECD index number - numeric
+		s_h - The Kanji Dictionary (Spahn & Hadamitzky) - descriptor
+		nelson - "Classic" Nelson - numeric
+		oneill - Japanese Names (O'Neill) - numeric
+	--><!ELEMENT freq (#PCDATA)>
+<!-- 
+	A frequency-of-use ranking. The 2,500 most-used characters have a 
+	ranking; those characters that lack this field are not ranked. The 
+	frequency is a number from 1 to 2,500 that expresses the relative 
+	frequency of occurrence of a character in modern Japanese. This is
+	based on a survey in newspapers, so it is biassed towards kanji
+	used in newspaper articles. The discrimination between the less
+	frequently used kanji is not strong.
+	--><!ELEMENT rad_name (#PCDATA)>
+<!-- 
+	When the kanji is itself a radical and has a name, this element
+	contains the name (in hiragana.) [T2]
+	--><!ELEMENT dic_number (dic_ref)+>
+<!-- 
+	This element contains the index numbers and similar unstructured
+	information such as page numbers in a number of published dictionaries,
+	and instructional books on kanji.
+	--><!ELEMENT dic_ref (#PCDATA)>
+<!-- 
+	Each dic_ref contains an index number. The particular dictionary,
+	etc. is defined by the dr_type attribute.
+	--><!ATTLIST dic_ref dr_type CDATA #REQUIRED>
+<!-- 
+	The dr_type defines the dictionary or reference book, etc. to which
+	dic_ref element applies. The initial allocation is:
+	  nelson_c - "Modern Reader's Japanese-English Character Dictionary",  
+	  	edited by Andrew Nelson (now published as the "Classic" 
+	  	Nelson).
+	  nelson_n - "The New Nelson Japanese-English Character Dictionary", 
+	  	edited by John Haig.
+	  halpern_njecd - "New Japanese-English Character Dictionary", 
+	  	edited by Jack Halpern.
+	  halpern_kkld - "Kanji Learners Dictionary" (Kodansha) edited by 
+	  	Jack Halpern.
+	  heisig - "Remembering The  Kanji"  by  James Heisig.
+	  gakken - "A  New Dictionary of Kanji Usage" (Gakken)
+	  oneill_names - "Japanese Names", by P.G. O'Neill. 
+	  oneill_kk - "Essential Kanji" by P.G. O'Neill.
+	  moro - "Daikanwajiten" compiled by Morohashi. For some kanji two
+	  	additional attributes are used: m_vol:  the volume of the
+	  	dictionary in which the kanji is found, and m_page: the page
+	  	number in the volume.
+	  henshall - "A Guide To Remembering Japanese Characters" by
+	  	Kenneth G.  Henshall.
+	  sh_kk - "Kanji and Kana" by Spahn and Hadamitzky.
+	  sakade - "A Guide To Reading and Writing Japanese" edited by
+	  	Florence Sakade.
+	  henshall3 - "A Guide To Reading and Writing Japanese" 3rd
+		edition, edited by Henshall, Seeley and De Groot.
+	  tutt_cards - Tuttle Kanji Cards, compiled by Alexander Kask.
+	  crowley - "The Kanji Way to Japanese Language Power" by
+	  	Dale Crowley.
+	  kanji_in_context - "Kanji in Context" by Nishiguchi and Kono.
+	  busy_people - "Japanese For Busy People" vols I-III, published
+		by the AJLT. The codes are the volume.chapter.
+	  kodansha_compact - the "Kodansha Compact Kanji Guide".
+	--><!ATTLIST dic_ref m_vol CDATA #IMPLIED>
+<!-- 
+	See above under "moro".
+	--><!ATTLIST dic_ref m_page CDATA #IMPLIED>
+<!-- 
+	See above under "moro".
+	--><!ELEMENT query_code (q_code)+>
+<!-- 
+	These codes contain information relating to the glyph, and can be used
+	for finding a required kanji. The type of code is defined by the
+	qc_type attribute.
+	--><!ELEMENT q_code (#PCDATA)>
+<!--
+	The q_code contains the actual query-code value, according to the
+	qc_type attribute.
+	--><!ATTLIST q_code qc_type CDATA #REQUIRED>
+<!-- 
+	The q_code attribute defines the type of query code. The current values
+	are:
+	  skip -  Halpern's SKIP (System  of  Kanji  Indexing  by  Patterns) 
+	  	code. The  format is n-nn-nn.  See the KANJIDIC  documentation 
+	  	for  a description of the code and restrictions on  the 
+	  	commercial  use  of this data. [P]
+
+	  sh_desc - the descriptor codes for The Kanji Dictionary (Tuttle 
+	  	1996) by Spahn and Hadamitzky. They are in the form nxnn.n,  
+	  	e.g.  3k11.2, where the  kanji has 3 strokes in the 
+	  	identifying radical, it is radical "k" in the SH 
+	  	classification system, there are 11 other strokes, and it is 
+	  	the 2nd kanji in the 3k11 sequence. (I am very grateful to 
+	  	Mark Spahn for providing the list of these descriptor codes 
+	  	for the kanji in this file.) [I]
+	  four_corner - the "Four Corner" code for the kanji. This is a code 
+	  	invented by Wang Chen in 1928. See the KANJIDIC documentation 
+	  	for  an overview of  the Four Corner System. [Q]
+
+	  deroo - the codes developed by the late Father Joseph De Roo, and 
+	  	published in  his book "2001 Kanji" (Bojinsha). Fr De Roo 
+	  	gave his permission for these codes to be included. [DR]
+	  misclass - a possible misclassification of the kanji according
+		to one of the code types. (See the "Z" codes in the KANJIDIC
+		documentation for more details.)
+	  
+	--><!ELEMENT reading_meaning (rmgroup* , nanori*)>
+<!-- 
+	The readings for the kanji in several languages, and the meanings, also
+	in several languages. The readings and meanings are grouped to enable
+	the handling of the situation where the meaning is differentiated by 
+	reading. [T1]
+	--><!ELEMENT nanori (#PCDATA)>
+<!-- 
+	Japanese readings that are now only associated with names.
+	--><!ELEMENT rmgroup (reading* , meaning*)>
+<!ELEMENT reading (#PCDATA)>
+<!-- 
+	The reading element contains the reading or pronunciation
+	of the kanji.
+	--><!ATTLIST reading r_type CDATA #REQUIRED>
+<!-- 
+	The r_type attribute defines the type of reading in the reading
+	element. The current values are:
+	  pinyin - the modern PinYin romanization of the Chinese reading 
+	  	of the kanji. The tones are represented by a concluding 
+	  	digit. [Y]
+	  korean_r - the romanized form of the Korean reading(s) of the 
+	  	kanji.  The readings are in the (Republic of Korea) Ministry 
+	  	of Education style of romanization. [W]
+	  korean_h - the Korean reading(s) of the kanji in hangul.
+	  ja_on - the "on" Japanese reading of the kanji, in katakana. A
+	  	second attribute r_status, if present, will indicate with
+	  	a value of "jy" whether the reading is approved for a
+	  	"Jouyou kanji".
+	  ja_kun - the "kun" Japanese reading of the kanji, in hiragana. 
+	  	Where relevant the okurigana is also included separated by a 
+	  	".". Readings associated with prefixes and suffixes are 
+	  	marked with a "-". A second attribute r_status, if present, 
+	  	will indicate with a value of "jy" whether the reading is 
+	  	approved for a "Jouyou kanji".
+	--><!ATTLIST reading r_status CDATA #IMPLIED>
+<!-- 
+	See under ja_on and ja_kun above.
+	--><!ELEMENT meaning (#PCDATA)>
+<!-- 
+	The meaning associated with the kanji.
+	--><!ATTLIST meaning m_lang CDATA #IMPLIED>
+<!-- 
+	The m_lang attribute defines the target language of the meaning. It 
+	will be coded using the two-letter language code from the ISO 639 
+	standard. When absent, the value "en" (i.e. English) is implied. [{}]
+	-->]>
+<kanjidic2>
+</kanjidic2>
diff --git a/result/intsubset2.xml.rde b/result/intsubset2.xml.rde
new file mode 100644
index 00000000..d27e245e
--- /dev/null
+++ b/result/intsubset2.xml.rde
@@ -0,0 +1,5 @@
+0 10 kanjidic2 0 0
+0 1 kanjidic2 0 0
+1 14 #text 0 1 
+
+0 15 kanjidic2 0 0
diff --git a/result/intsubset2.xml.rdr b/result/intsubset2.xml.rdr
new file mode 100644
index 00000000..d27e245e
--- /dev/null
+++ b/result/intsubset2.xml.rdr
@@ -0,0 +1,5 @@
+0 10 kanjidic2 0 0
+0 1 kanjidic2 0 0
+1 14 #text 0 1 
+
+0 15 kanjidic2 0 0
diff --git a/result/intsubset2.xml.sax b/result/intsubset2.xml.sax
new file mode 100644
index 00000000..b4d7bf46
--- /dev/null
+++ b/result/intsubset2.xml.sax
@@ -0,0 +1,286 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.internalSubset(kanjidic2, , )
+SAX.comment( Version 1.3
+	This is the DTD of the XML-format kanji file combining information from
+	the KANJIDIC and KANJD212 files. It is intended to be largely self-
+	documenting, with each field being accompanied by an explanatory
+	comment.
+
+	The file covers the following kanji:
+	(a) the 6,355 kanji from JIS X 0208;
+	(b) the 5,801 kanji from JIS X 0212;
+	(c) the 3,625 kanji from JIS X 0213 as follows:
+		(i) the 2,741 kanji which are also in JIS X 0212 have
+		JIS X 0213 code-points (kuten) added to the existing entry;
+		(ii) the 884 "new" kanji have new entries.
+
+	At the end of the explanation for a number of fields there is a tag
+	with the format [N]. This indicates the leading letter(s) of the
+	equivalent field in the KANJIDIC and KANJD212 files.
+
+	The KANJIDIC documentation should also be read for additional 
+	information about the information in the file.
+	)
+SAX.elementDecl(kanjidic2, 4, ...)
+SAX.elementDecl(header, 4, ...)
+SAX.comment(
+	The single header element will contain identification information
+	about the version of the file 
+	)
+SAX.elementDecl(file_version, 3, ...)
+SAX.comment(
+	This field denotes the version of kanjidic2 structure, as more
+	than one version may exist.
+	)
+SAX.elementDecl(database_version, 3, ...)
+SAX.comment(
+	The version of the file, in the format YYYY-NN, where NN will be
+	a number starting with 01 for the first version released in a
+	calendar year, then increasing for each version in that year.
+	)
+SAX.elementDecl(date_of_creation, 3, ...)
+SAX.comment(
+	The date the file was created in international format (YYYY-MM-DD).
+	)
+SAX.elementDecl(character, 4, ...)
+SAX.elementDecl(literal, 3, ...)
+SAX.comment(
+	The character itself in UTF8 coding.
+	)
+SAX.elementDecl(codepoint, 4, ...)
+SAX.comment( 
+	The codepoint element states the code of the character in the various
+	character set standards.
+	)
+SAX.elementDecl(cp_value, 3, ...)
+SAX.comment( 
+	The cp_value contains the codepoint of the character in a particular
+	standard. The standard will be identified in the cp_type attribute.
+	)
+SAX.attributeDecl(cp_value, cp_type, 1, 2, NULL, ...)
+SAX.comment( 
+	The cp_type attribute states the coding standard applying to the
+	element. The values assigned so far are:
+		jis208 - JIS X 0208-1997 - kuten coding (nn-nn)
+		jis212 - JIS X 0212-1990 - kuten coding (nn-nn)
+		jis213 - JIS X 0213-2000 - kuten coding (p-nn-nn)
+		ucs - Unicode 4.0 - hex coding (4 or 5 hexadecimal digits)
+	)
+SAX.elementDecl(radical, 4, ...)
+SAX.elementDecl(rad_value, 3, ...)
+SAX.comment( 
+	The radical number, in the range 1 to 214. The particular
+	classification type is stated in the rad_type attribute.
+	)
+SAX.attributeDecl(rad_value, rad_type, 1, 2, NULL, ...)
+SAX.comment( 
+	The rad_type attribute states the type of radical classification.
+		classical - as recorded in the KangXi Zidian.
+		nelson - as used in the Nelson "Modern Japanese-English 
+		Character Dictionary" (i.e. the Classic, not the New Nelson).
+		This will only be used where Nelson reclassified the kanji.
+	)
+SAX.elementDecl(misc, 4, ...)
+SAX.elementDecl(grade, 3, ...)
+SAX.comment( 
+	The Jouyou Kanji grade level. 1 through 6 indicate the grade in which
+	the kanji is taught in Japanese schools. 8 indicates it is one of the
+	remaining Jouyou Kanji to be learned in junior high school, and 9 
+	indicates it is a Jinmeiyou (for use in names) kanji. [G]
+	)
+SAX.elementDecl(stroke_count, 3, ...)
+SAX.comment( 
+	The stroke count of the kanji, including the radical. If more than 
+	one, the first is considered the accepted count, while subsequent ones 
+	are common miscounts. (See Appendix E. of the KANJIDIC documentation
+	for some of the rules applied when counting strokes in some of the 
+	radicals.) [S]
+	)
+SAX.elementDecl(variant, 3, ...)
+SAX.comment( 
+	A cross-reference code to another kanji, usually regarded as a variant.
+	The type of cross-reference is given in the var_type attribute.
+	)
+SAX.attributeDecl(variant, var_type, 1, 2, NULL, ...)
+SAX.comment( 
+	The var_type attribute indicates the type of variant code. The current
+	values are: 
+		jis208 - in JIS X 0208 - kuten coding
+		jis212 - in JIS X 0212 - kuten coding
+		jis213 - in JIS X 0213 - kuten coding
+		deroo - De Roo number - numeric
+		njecd - Halpern NJECD index number - numeric
+		s_h - The Kanji Dictionary (Spahn & Hadamitzky) - descriptor
+		nelson - "Classic" Nelson - numeric
+		oneill - Japanese Names (O'Neill) - numeric
+	)
+SAX.elementDecl(freq, 3, ...)
+SAX.comment( 
+	A frequency-of-use ranking. The 2,500 most-used characters have a 
+	ranking; those characters that lack this field are not ranked. The 
+	frequency is a number from 1 to 2,500 that expresses the relative 
+	frequency of occurrence of a character in modern Japanese. This is
+	based on a survey in newspapers, so it is biassed towards kanji
+	used in newspaper articles. The discrimination between the less
+	frequently used kanji is not strong.
+	)
+SAX.elementDecl(rad_name, 3, ...)
+SAX.comment( 
+	When the kanji is itself a radical and has a name, this element
+	contains the name (in hiragana.) [T2]
+	)
+SAX.elementDecl(dic_number, 4, ...)
+SAX.comment( 
+	This element contains the index numbers and similar unstructured
+	information such as page numbers in a number of published dictionaries,
+	and instructional books on kanji.
+	)
+SAX.elementDecl(dic_ref, 3, ...)
+SAX.comment( 
+	Each dic_ref contains an index number. The particular dictionary,
+	etc. is defined by the dr_type attribute.
+	)
+SAX.attributeDecl(dic_ref, dr_type, 1, 2, NULL, ...)
+SAX.comment( 
+	The dr_type defines the dictionary or reference book, etc. to which
+	dic_ref element applies. The initial allocation is:
+	  nelson_c - "Modern Reader's Japanese-English Character Dictionary",  
+	  	edited by Andrew Nelson (now published as the "Classic" 
+	  	Nelson).
+	  nelson_n - "The New Nelson Japanese-English Character Dictionary", 
+	  	edited by John Haig.
+	  halpern_njecd - "New Japanese-English Character Dictionary", 
+	  	edited by Jack Halpern.
+	  halpern_kkld - "Kanji Learners Dictionary" (Kodansha) edited by 
+	  	Jack Halpern.
+	  heisig - "Remembering The  Kanji"  by  James Heisig.
+	  gakken - "A  New Dictionary of Kanji Usage" (Gakken)
+	  oneill_names - "Japanese Names", by P.G. O'Neill. 
+	  oneill_kk - "Essential Kanji" by P.G. O'Neill.
+	  moro - "Daikanwajiten" compiled by Morohashi. For some kanji two
+	  	additional attributes are used: m_vol:  the volume of the
+	  	dictionary in which the kanji is found, and m_page: the page
+	  	number in the volume.
+	  henshall - "A Guide To Remembering Japanese Characters" by
+	  	Kenneth G.  Henshall.
+	  sh_kk - "Kanji and Kana" by Spahn and Hadamitzky.
+	  sakade - "A Guide To Reading and Writing Japanese" edited by
+	  	Florence Sakade.
+	  henshall3 - "A Guide To Reading and Writing Japanese" 3rd
+		edition, edited by Henshall, Seeley and De Groot.
+	  tutt_cards - Tuttle Kanji Cards, compiled by Alexander Kask.
+	  crowley - "The Kanji Way to Japanese Language Power" by
+	  	Dale Crowley.
+	  kanji_in_context - "Kanji in Context" by Nishiguchi and Kono.
+	  busy_people - "Japanese For Busy People" vols I-III, published
+		by the AJLT. The codes are the volume.chapter.
+	  kodansha_compact - the "Kodansha Compact Kanji Guide".
+	)
+SAX.attributeDecl(dic_ref, m_vol, 1, 3, NULL, ...)
+SAX.comment( 
+	See above under "moro".
+	)
+SAX.attributeDecl(dic_ref, m_page, 1, 3, NULL, ...)
+SAX.comment( 
+	See above under "moro".
+	)
+SAX.elementDecl(query_code, 4, ...)
+SAX.comment( 
+	These codes contain information relating to the glyph, and can be used
+	for finding a required kanji. The type of code is defined by the
+	qc_type attribute.
+	)
+SAX.elementDecl(q_code, 3, ...)
+SAX.comment(
+	The q_code contains the actual query-code value, according to the
+	qc_type attribute.
+	)
+SAX.attributeDecl(q_code, qc_type, 1, 2, NULL, ...)
+SAX.comment( 
+	The q_code attribute defines the type of query code. The current values
+	are:
+	  skip -  Halpern's SKIP (System  of  Kanji  Indexing  by  Patterns) 
+	  	code. The  format is n-nn-nn.  See the KANJIDIC  documentation 
+	  	for  a description of the code and restrictions on  the 
+	  	commercial  use  of this data. [P]
+
+	  sh_desc - the descriptor codes for The Kanji Dictionary (Tuttle 
+	  	1996) by Spahn and Hadamitzky. They are in the form nxnn.n,  
+	  	e.g.  3k11.2, where the  kanji has 3 strokes in the 
+	  	identifying radical, it is radical "k" in the SH 
+	  	classification system, there are 11 other strokes, and it is 
+	  	the 2nd kanji in the 3k11 sequence. (I am very grateful to 
+	  	Mark Spahn for providing the list of these descriptor codes 
+	  	for the kanji in this file.) [I]
+	  four_corner - the "Four Corner" code for the kanji. This is a code 
+	  	invented by Wang Chen in 1928. See the KANJIDIC documentation 
+	  	for  an overview of  the Four Corner System. [Q]
+
+	  deroo - the codes developed by the late Father Joseph De Roo, and 
+	  	published in  his book "2001 Kanji" (Bojinsha). Fr De Roo 
+	  	gave his permission for these codes to be included. [DR]
+	  misclass - a possible misclassification of the kanji according
+		to one of the code types. (See the "Z" codes in the KANJIDIC
+		documentation for more details.)
+	  
+	)
+SAX.elementDecl(reading_meaning, 4, ...)
+SAX.comment( 
+	The readings for the kanji in several languages, and the meanings, also
+	in several languages. The readings and meanings are grouped to enable
+	the handling of the situation where the meaning is differentiated by 
+	reading. [T1]
+	)
+SAX.elementDecl(nanori, 3, ...)
+SAX.comment( 
+	Japanese readings that are now only associated with names.
+	)
+SAX.elementDecl(rmgroup, 4, ...)
+SAX.elementDecl(reading, 3, ...)
+SAX.comment( 
+	The reading element contains the reading or pronunciation
+	of the kanji.
+	)
+SAX.attributeDecl(reading, r_type, 1, 2, NULL, ...)
+SAX.comment( 
+	The r_type attribute defines the type of reading in the reading
+	element. The current values are:
+	  pinyin - the modern PinYin romanization of the Chinese reading 
+	  	of the kanji. The tones are represented by a concluding 
+	  	digit. [Y]
+	  korean_r - the romanized form of the Korean reading(s) of the 
+	  	kanji.  The readings are in the (Republic of Korea) Ministry 
+	  	of Education style of romanization. [W]
+	  korean_h - the Korean reading(s) of the kanji in hangul.
+	  ja_on - the "on" Japanese reading of the kanji, in katakana. A
+	  	second attribute r_status, if present, will indicate with
+	  	a value of "jy" whether the reading is approved for a
+	  	"Jouyou kanji".
+	  ja_kun - the "kun" Japanese reading of the kanji, in hiragana. 
+	  	Where relevant the okurigana is also included separated by a 
+	  	".". Readings associated with prefixes and suffixes are 
+	  	marked with a "-". A second attribute r_status, if present, 
+	  	will indicate with a value of "jy" whether the reading is 
+	  	approved for a "Jouyou kanji".
+	)
+SAX.attributeDecl(reading, r_status, 1, 3, NULL, ...)
+SAX.comment( 
+	See under ja_on and ja_kun above.
+	)
+SAX.elementDecl(meaning, 3, ...)
+SAX.comment( 
+	The meaning associated with the kanji.
+	)
+SAX.attributeDecl(meaning, m_lang, 1, 3, NULL, ...)
+SAX.comment( 
+	The m_lang attribute defines the target language of the meaning. It 
+	will be coded using the two-letter language code from the ISO 639 
+	standard. When absent, the value "en" (i.e. English) is implied. [{}]
+	)
+SAX.externalSubset(kanjidic2, , )
+SAX.startElement(kanjidic2)
+SAX.characters(
+, 1)
+SAX.endElement(kanjidic2)
+SAX.endDocument()
diff --git a/result/noent/intsubset2.xml b/result/noent/intsubset2.xml
new file mode 100644
index 00000000..b1039553
--- /dev/null
+++ b/result/noent/intsubset2.xml
@@ -0,0 +1,250 @@
+<?xml version="1.0"?>
+<!DOCTYPE kanjidic2 [
+<!-- Version 1.3
+	This is the DTD of the XML-format kanji file combining information from
+	the KANJIDIC and KANJD212 files. It is intended to be largely self-
+	documenting, with each field being accompanied by an explanatory
+	comment.
+
+	The file covers the following kanji:
+	(a) the 6,355 kanji from JIS X 0208;
+	(b) the 5,801 kanji from JIS X 0212;
+	(c) the 3,625 kanji from JIS X 0213 as follows:
+		(i) the 2,741 kanji which are also in JIS X 0212 have
+		JIS X 0213 code-points (kuten) added to the existing entry;
+		(ii) the 884 "new" kanji have new entries.
+
+	At the end of the explanation for a number of fields there is a tag
+	with the format [N]. This indicates the leading letter(s) of the
+	equivalent field in the KANJIDIC and KANJD212 files.
+
+	The KANJIDIC documentation should also be read for additional 
+	information about the information in the file.
+	--><!ELEMENT kanjidic2 (header , character*)>
+<!ELEMENT header (file_version , database_version , date_of_creation)>
+<!--
+	The single header element will contain identification information
+	about the version of the file 
+	--><!ELEMENT file_version (#PCDATA)>
+<!--
+	This field denotes the version of kanjidic2 structure, as more
+	than one version may exist.
+	--><!ELEMENT database_version (#PCDATA)>
+<!--
+	The version of the file, in the format YYYY-NN, where NN will be
+	a number starting with 01 for the first version released in a
+	calendar year, then increasing for each version in that year.
+	--><!ELEMENT date_of_creation (#PCDATA)>
+<!--
+	The date the file was created in international format (YYYY-MM-DD).
+	--><!ELEMENT character (literal , codepoint , radical , misc , dic_number? , query_code? , reading_meaning? , nanori?)*>
+<!ELEMENT literal (#PCDATA)>
+<!--
+	The character itself in UTF8 coding.
+	--><!ELEMENT codepoint (cp_value)+>
+<!-- 
+	The codepoint element states the code of the character in the various
+	character set standards.
+	--><!ELEMENT cp_value (#PCDATA)>
+<!-- 
+	The cp_value contains the codepoint of the character in a particular
+	standard. The standard will be identified in the cp_type attribute.
+	--><!ATTLIST cp_value cp_type CDATA #REQUIRED>
+<!-- 
+	The cp_type attribute states the coding standard applying to the
+	element. The values assigned so far are:
+		jis208 - JIS X 0208-1997 - kuten coding (nn-nn)
+		jis212 - JIS X 0212-1990 - kuten coding (nn-nn)
+		jis213 - JIS X 0213-2000 - kuten coding (p-nn-nn)
+		ucs - Unicode 4.0 - hex coding (4 or 5 hexadecimal digits)
+	--><!ELEMENT radical (rad_value)+>
+<!ELEMENT rad_value (#PCDATA)>
+<!-- 
+	The radical number, in the range 1 to 214. The particular
+	classification type is stated in the rad_type attribute.
+	--><!ATTLIST rad_value rad_type CDATA #REQUIRED>
+<!-- 
+	The rad_type attribute states the type of radical classification.
+		classical - as recorded in the KangXi Zidian.
+		nelson - as used in the Nelson "Modern Japanese-English 
+		Character Dictionary" (i.e. the Classic, not the New Nelson).
+		This will only be used where Nelson reclassified the kanji.
+	--><!ELEMENT misc (grade? , stroke_count+ , variant* , freq* , rad_name*)>
+<!ELEMENT grade (#PCDATA)>
+<!-- 
+	The Jouyou Kanji grade level. 1 through 6 indicate the grade in which
+	the kanji is taught in Japanese schools. 8 indicates it is one of the
+	remaining Jouyou Kanji to be learned in junior high school, and 9 
+	indicates it is a Jinmeiyou (for use in names) kanji. [G]
+	--><!ELEMENT stroke_count (#PCDATA)>
+<!-- 
+	The stroke count of the kanji, including the radical. If more than 
+	one, the first is considered the accepted count, while subsequent ones 
+	are common miscounts. (See Appendix E. of the KANJIDIC documentation
+	for some of the rules applied when counting strokes in some of the 
+	radicals.) [S]
+	--><!ELEMENT variant (#PCDATA)>
+<!-- 
+	A cross-reference code to another kanji, usually regarded as a variant.
+	The type of cross-reference is given in the var_type attribute.
+	--><!ATTLIST variant var_type CDATA #REQUIRED>
+<!-- 
+	The var_type attribute indicates the type of variant code. The current
+	values are: 
+		jis208 - in JIS X 0208 - kuten coding
+		jis212 - in JIS X 0212 - kuten coding
+		jis213 - in JIS X 0213 - kuten coding
+		deroo - De Roo number - numeric
+		njecd - Halpern NJECD index number - numeric
+		s_h - The Kanji Dictionary (Spahn & Hadamitzky) - descriptor
+		nelson - "Classic" Nelson - numeric
+		oneill - Japanese Names (O'Neill) - numeric
+	--><!ELEMENT freq (#PCDATA)>
+<!-- 
+	A frequency-of-use ranking. The 2,500 most-used characters have a 
+	ranking; those characters that lack this field are not ranked. The 
+	frequency is a number from 1 to 2,500 that expresses the relative 
+	frequency of occurrence of a character in modern Japanese. This is
+	based on a survey in newspapers, so it is biassed towards kanji
+	used in newspaper articles. The discrimination between the less
+	frequently used kanji is not strong.
+	--><!ELEMENT rad_name (#PCDATA)>
+<!-- 
+	When the kanji is itself a radical and has a name, this element
+	contains the name (in hiragana.) [T2]
+	--><!ELEMENT dic_number (dic_ref)+>
+<!-- 
+	This element contains the index numbers and similar unstructured
+	information such as page numbers in a number of published dictionaries,
+	and instructional books on kanji.
+	--><!ELEMENT dic_ref (#PCDATA)>
+<!-- 
+	Each dic_ref contains an index number. The particular dictionary,
+	etc. is defined by the dr_type attribute.
+	--><!ATTLIST dic_ref dr_type CDATA #REQUIRED>
+<!-- 
+	The dr_type defines the dictionary or reference book, etc. to which
+	dic_ref element applies. The initial allocation is:
+	  nelson_c - "Modern Reader's Japanese-English Character Dictionary",  
+	  	edited by Andrew Nelson (now published as the "Classic" 
+	  	Nelson).
+	  nelson_n - "The New Nelson Japanese-English Character Dictionary", 
+	  	edited by John Haig.
+	  halpern_njecd - "New Japanese-English Character Dictionary", 
+	  	edited by Jack Halpern.
+	  halpern_kkld - "Kanji Learners Dictionary" (Kodansha) edited by 
+	  	Jack Halpern.
+	  heisig - "Remembering The  Kanji"  by  James Heisig.
+	  gakken - "A  New Dictionary of Kanji Usage" (Gakken)
+	  oneill_names - "Japanese Names", by P.G. O'Neill. 
+	  oneill_kk - "Essential Kanji" by P.G. O'Neill.
+	  moro - "Daikanwajiten" compiled by Morohashi. For some kanji two
+	  	additional attributes are used: m_vol:  the volume of the
+	  	dictionary in which the kanji is found, and m_page: the page
+	  	number in the volume.
+	  henshall - "A Guide To Remembering Japanese Characters" by
+	  	Kenneth G.  Henshall.
+	  sh_kk - "Kanji and Kana" by Spahn and Hadamitzky.
+	  sakade - "A Guide To Reading and Writing Japanese" edited by
+	  	Florence Sakade.
+	  henshall3 - "A Guide To Reading and Writing Japanese" 3rd
+		edition, edited by Henshall, Seeley and De Groot.
+	  tutt_cards - Tuttle Kanji Cards, compiled by Alexander Kask.
+	  crowley - "The Kanji Way to Japanese Language Power" by
+	  	Dale Crowley.
+	  kanji_in_context - "Kanji in Context" by Nishiguchi and Kono.
+	  busy_people - "Japanese For Busy People" vols I-III, published
+		by the AJLT. The codes are the volume.chapter.
+	  kodansha_compact - the "Kodansha Compact Kanji Guide".
+	--><!ATTLIST dic_ref m_vol CDATA #IMPLIED>
+<!-- 
+	See above under "moro".
+	--><!ATTLIST dic_ref m_page CDATA #IMPLIED>
+<!-- 
+	See above under "moro".
+	--><!ELEMENT query_code (q_code)+>
+<!-- 
+	These codes contain information relating to the glyph, and can be used
+	for finding a required kanji. The type of code is defined by the
+	qc_type attribute.
+	--><!ELEMENT q_code (#PCDATA)>
+<!--
+	The q_code contains the actual query-code value, according to the
+	qc_type attribute.
+	--><!ATTLIST q_code qc_type CDATA #REQUIRED>
+<!-- 
+	The q_code attribute defines the type of query code. The current values
+	are:
+	  skip -  Halpern's SKIP (System  of  Kanji  Indexing  by  Patterns) 
+	  	code. The  format is n-nn-nn.  See the KANJIDIC  documentation 
+	  	for  a description of the code and restrictions on  the 
+	  	commercial  use  of this data. [P]
+
+	  sh_desc - the descriptor codes for The Kanji Dictionary (Tuttle 
+	  	1996) by Spahn and Hadamitzky. They are in the form nxnn.n,  
+	  	e.g.  3k11.2, where the  kanji has 3 strokes in the 
+	  	identifying radical, it is radical "k" in the SH 
+	  	classification system, there are 11 other strokes, and it is 
+	  	the 2nd kanji in the 3k11 sequence. (I am very grateful to 
+	  	Mark Spahn for providing the list of these descriptor codes 
+	  	for the kanji in this file.) [I]
+	  four_corner - the "Four Corner" code for the kanji. This is a code 
+	  	invented by Wang Chen in 1928. See the KANJIDIC documentation 
+	  	for  an overview of  the Four Corner System. [Q]
+
+	  deroo - the codes developed by the late Father Joseph De Roo, and 
+	  	published in  his book "2001 Kanji" (Bojinsha). Fr De Roo 
+	  	gave his permission for these codes to be included. [DR]
+	  misclass - a possible misclassification of the kanji according
+		to one of the code types. (See the "Z" codes in the KANJIDIC
+		documentation for more details.)
+	  
+	--><!ELEMENT reading_meaning (rmgroup* , nanori*)>
+<!-- 
+	The readings for the kanji in several languages, and the meanings, also
+	in several languages. The readings and meanings are grouped to enable
+	the handling of the situation where the meaning is differentiated by 
+	reading. [T1]
+	--><!ELEMENT nanori (#PCDATA)>
+<!-- 
+	Japanese readings that are now only associated with names.
+	--><!ELEMENT rmgroup (reading* , meaning*)>
+<!ELEMENT reading (#PCDATA)>
+<!-- 
+	The reading element contains the reading or pronunciation
+	of the kanji.
+	--><!ATTLIST reading r_type CDATA #REQUIRED>
+<!-- 
+	The r_type attribute defines the type of reading in the reading
+	element. The current values are:
+	  pinyin - the modern PinYin romanization of the Chinese reading 
+	  	of the kanji. The tones are represented by a concluding 
+	  	digit. [Y]
+	  korean_r - the romanized form of the Korean reading(s) of the 
+	  	kanji.  The readings are in the (Republic of Korea) Ministry 
+	  	of Education style of romanization. [W]
+	  korean_h - the Korean reading(s) of the kanji in hangul.
+	  ja_on - the "on" Japanese reading of the kanji, in katakana. A
+	  	second attribute r_status, if present, will indicate with
+	  	a value of "jy" whether the reading is approved for a
+	  	"Jouyou kanji".
+	  ja_kun - the "kun" Japanese reading of the kanji, in hiragana. 
+	  	Where relevant the okurigana is also included separated by a 
+	  	".". Readings associated with prefixes and suffixes are 
+	  	marked with a "-". A second attribute r_status, if present, 
+	  	will indicate with a value of "jy" whether the reading is 
+	  	approved for a "Jouyou kanji".
+	--><!ATTLIST reading r_status CDATA #IMPLIED>
+<!-- 
+	See under ja_on and ja_kun above.
+	--><!ELEMENT meaning (#PCDATA)>
+<!-- 
+	The meaning associated with the kanji.
+	--><!ATTLIST meaning m_lang CDATA #IMPLIED>
+<!-- 
+	The m_lang attribute defines the target language of the meaning. It 
+	will be coded using the two-letter language code from the ISO 639 
+	standard. When absent, the value "en" (i.e. English) is implied. [{}]
+	-->]>
+<kanjidic2>
+</kanjidic2>
diff --git a/test/intsubset2.xml b/test/intsubset2.xml
new file mode 100644
index 00000000..4ae845a0
--- /dev/null
+++ b/test/intsubset2.xml
@@ -0,0 +1,282 @@
+<?xml version="1.0"?>
+<!DOCTYPE kanjidic2 [
+	<!-- Version 1.3
+	This is the DTD of the XML-format kanji file combining information from
+	the KANJIDIC and KANJD212 files. It is intended to be largely self-
+	documenting, with each field being accompanied by an explanatory
+	comment.
+
+	The file covers the following kanji:
+	(a) the 6,355 kanji from JIS X 0208;
+	(b) the 5,801 kanji from JIS X 0212;
+	(c) the 3,625 kanji from JIS X 0213 as follows:
+		(i) the 2,741 kanji which are also in JIS X 0212 have
+		JIS X 0213 code-points (kuten) added to the existing entry;
+		(ii) the 884 "new" kanji have new entries.
+
+	At the end of the explanation for a number of fields there is a tag
+	with the format [N]. This indicates the leading letter(s) of the
+	equivalent field in the KANJIDIC and KANJD212 files.
+
+	The KANJIDIC documentation should also be read for additional 
+	information about the information in the file.
+	-->
+<!ELEMENT kanjidic2 (header,character*)>
+<!ELEMENT header (file_version,database_version,date_of_creation)>
+<!--
+	The single header element will contain identification information
+	about the version of the file 
+	-->
+<!ELEMENT file_version (#PCDATA)>
+<!--
+	This field denotes the version of kanjidic2 structure, as more
+	than one version may exist.
+	-->
+<!ELEMENT database_version (#PCDATA)>
+<!--
+	The version of the file, in the format YYYY-NN, where NN will be
+	a number starting with 01 for the first version released in a
+	calendar year, then increasing for each version in that year.
+	-->
+<!ELEMENT date_of_creation (#PCDATA)>
+<!--
+	The date the file was created in international format (YYYY-MM-DD).
+	-->
+<!ELEMENT character (literal,codepoint, radical, misc, dic_number?, query_code?, reading_meaning?,nanori?)*>
+<!ELEMENT literal (#PCDATA)>
+<!--
+	The character itself in UTF8 coding.
+	-->
+<!ELEMENT codepoint (cp_value+)>
+	<!-- 
+	The codepoint element states the code of the character in the various
+	character set standards.
+	-->
+<!ELEMENT cp_value (#PCDATA)>
+	<!-- 
+	The cp_value contains the codepoint of the character in a particular
+	standard. The standard will be identified in the cp_type attribute.
+	-->
+<!ATTLIST cp_value cp_type CDATA #REQUIRED>
+	<!-- 
+	The cp_type attribute states the coding standard applying to the
+	element. The values assigned so far are:
+		jis208 - JIS X 0208-1997 - kuten coding (nn-nn)
+		jis212 - JIS X 0212-1990 - kuten coding (nn-nn)
+		jis213 - JIS X 0213-2000 - kuten coding (p-nn-nn)
+		ucs - Unicode 4.0 - hex coding (4 or 5 hexadecimal digits)
+	-->
+<!ELEMENT radical (rad_value+)>
+<!ELEMENT rad_value (#PCDATA)>
+	<!-- 
+	The radical number, in the range 1 to 214. The particular
+	classification type is stated in the rad_type attribute.
+	-->
+<!ATTLIST rad_value rad_type CDATA #REQUIRED>
+	<!-- 
+	The rad_type attribute states the type of radical classification.
+		classical - as recorded in the KangXi Zidian.
+		nelson - as used in the Nelson "Modern Japanese-English 
+		Character Dictionary" (i.e. the Classic, not the New Nelson).
+		This will only be used where Nelson reclassified the kanji.
+	-->
+<!ELEMENT misc (grade?, stroke_count+, variant*, freq*, rad_name*)>
+<!ELEMENT grade (#PCDATA)>
+	<!-- 
+	The Jouyou Kanji grade level. 1 through 6 indicate the grade in which
+	the kanji is taught in Japanese schools. 8 indicates it is one of the
+	remaining Jouyou Kanji to be learned in junior high school, and 9 
+	indicates it is a Jinmeiyou (for use in names) kanji. [G]
+	-->
+<!ELEMENT stroke_count (#PCDATA)>
+	<!-- 
+	The stroke count of the kanji, including the radical. If more than 
+	one, the first is considered the accepted count, while subsequent ones 
+	are common miscounts. (See Appendix E. of the KANJIDIC documentation
+	for some of the rules applied when counting strokes in some of the 
+	radicals.) [S]
+	-->
+<!ELEMENT variant (#PCDATA)>
+	<!-- 
+	A cross-reference code to another kanji, usually regarded as a variant.
+	The type of cross-reference is given in the var_type attribute.
+	-->
+<!ATTLIST variant var_type CDATA #REQUIRED>
+	<!-- 
+	The var_type attribute indicates the type of variant code. The current
+	values are: 
+		jis208 - in JIS X 0208 - kuten coding
+		jis212 - in JIS X 0212 - kuten coding
+		jis213 - in JIS X 0213 - kuten coding
+		deroo - De Roo number - numeric
+		njecd - Halpern NJECD index number - numeric
+		s_h - The Kanji Dictionary (Spahn & Hadamitzky) - descriptor
+		nelson - "Classic" Nelson - numeric
+		oneill - Japanese Names (O'Neill) - numeric
+	-->
+<!ELEMENT freq (#PCDATA)>
+	<!-- 
+	A frequency-of-use ranking. The 2,500 most-used characters have a 
+	ranking; those characters that lack this field are not ranked. The 
+	frequency is a number from 1 to 2,500 that expresses the relative 
+	frequency of occurrence of a character in modern Japanese. This is
+	based on a survey in newspapers, so it is biassed towards kanji
+	used in newspaper articles. The discrimination between the less
+	frequently used kanji is not strong.
+	-->
+<!ELEMENT rad_name (#PCDATA)>
+	<!-- 
+	When the kanji is itself a radical and has a name, this element
+	contains the name (in hiragana.) [T2]
+	-->
+<!ELEMENT dic_number (dic_ref+)>
+	<!-- 
+	This element contains the index numbers and similar unstructured
+	information such as page numbers in a number of published dictionaries,
+	and instructional books on kanji.
+	-->
+<!ELEMENT dic_ref (#PCDATA)>
+	<!-- 
+	Each dic_ref contains an index number. The particular dictionary,
+	etc. is defined by the dr_type attribute.
+	-->
+<!ATTLIST dic_ref dr_type CDATA #REQUIRED>
+	<!-- 
+	The dr_type defines the dictionary or reference book, etc. to which
+	dic_ref element applies. The initial allocation is:
+	  nelson_c - "Modern Reader's Japanese-English Character Dictionary",  
+	  	edited by Andrew Nelson (now published as the "Classic" 
+	  	Nelson).
+	  nelson_n - "The New Nelson Japanese-English Character Dictionary", 
+	  	edited by John Haig.
+	  halpern_njecd - "New Japanese-English Character Dictionary", 
+	  	edited by Jack Halpern.
+	  halpern_kkld - "Kanji Learners Dictionary" (Kodansha) edited by 
+	  	Jack Halpern.
+	  heisig - "Remembering The  Kanji"  by  James Heisig.
+	  gakken - "A  New Dictionary of Kanji Usage" (Gakken)
+	  oneill_names - "Japanese Names", by P.G. O'Neill. 
+	  oneill_kk - "Essential Kanji" by P.G. O'Neill.
+	  moro - "Daikanwajiten" compiled by Morohashi. For some kanji two
+	  	additional attributes are used: m_vol:  the volume of the
+	  	dictionary in which the kanji is found, and m_page: the page
+	  	number in the volume.
+	  henshall - "A Guide To Remembering Japanese Characters" by
+	  	Kenneth G.  Henshall.
+	  sh_kk - "Kanji and Kana" by Spahn and Hadamitzky.
+	  sakade - "A Guide To Reading and Writing Japanese" edited by
+	  	Florence Sakade.
+	  henshall3 - "A Guide To Reading and Writing Japanese" 3rd
+		edition, edited by Henshall, Seeley and De Groot.
+	  tutt_cards - Tuttle Kanji Cards, compiled by Alexander Kask.
+	  crowley - "The Kanji Way to Japanese Language Power" by
+	  	Dale Crowley.
+	  kanji_in_context - "Kanji in Context" by Nishiguchi and Kono.
+	  busy_people - "Japanese For Busy People" vols I-III, published
+		by the AJLT. The codes are the volume.chapter.
+	  kodansha_compact - the "Kodansha Compact Kanji Guide".
+	-->
+<!ATTLIST dic_ref m_vol CDATA #IMPLIED>
+	<!-- 
+	See above under "moro".
+	-->
+<!ATTLIST dic_ref m_page CDATA #IMPLIED>
+	<!-- 
+	See above under "moro".
+	-->
+<!ELEMENT query_code (q_code+)>
+	<!-- 
+	These codes contain information relating to the glyph, and can be used
+	for finding a required kanji. The type of code is defined by the
+	qc_type attribute.
+	-->
+<!ELEMENT q_code (#PCDATA)>
+	<!--
+	The q_code contains the actual query-code value, according to the
+	qc_type attribute.
+	-->
+<!ATTLIST q_code qc_type CDATA #REQUIRED>
+	<!-- 
+	The q_code attribute defines the type of query code. The current values
+	are:
+	  skip -  Halpern's SKIP (System  of  Kanji  Indexing  by  Patterns) 
+	  	code. The  format is n-nn-nn.  See the KANJIDIC  documentation 
+	  	for  a description of the code and restrictions on  the 
+	  	commercial  use  of this data. [P]
+
+	  sh_desc - the descriptor codes for The Kanji Dictionary (Tuttle 
+	  	1996) by Spahn and Hadamitzky. They are in the form nxnn.n,  
+	  	e.g.  3k11.2, where the  kanji has 3 strokes in the 
+	  	identifying radical, it is radical "k" in the SH 
+	  	classification system, there are 11 other strokes, and it is 
+	  	the 2nd kanji in the 3k11 sequence. (I am very grateful to 
+	  	Mark Spahn for providing the list of these descriptor codes 
+	  	for the kanji in this file.) [I]
+	  four_corner - the "Four Corner" code for the kanji. This is a code 
+	  	invented by Wang Chen in 1928. See the KANJIDIC documentation 
+	  	for  an overview of  the Four Corner System. [Q]
+
+	  deroo - the codes developed by the late Father Joseph De Roo, and 
+	  	published in  his book "2001 Kanji" (Bojinsha). Fr De Roo 
+	  	gave his permission for these codes to be included. [DR]
+	  misclass - a possible misclassification of the kanji according
+		to one of the code types. (See the "Z" codes in the KANJIDIC
+		documentation for more details.)
+	  
+	-->
+<!ELEMENT reading_meaning (rmgroup*, nanori*)>
+	<!-- 
+	The readings for the kanji in several languages, and the meanings, also
+	in several languages. The readings and meanings are grouped to enable
+	the handling of the situation where the meaning is differentiated by 
+	reading. [T1]
+	-->
+<!ELEMENT nanori (#PCDATA)>
+	<!-- 
+	Japanese readings that are now only associated with names.
+	-->
+<!ELEMENT rmgroup (reading*, meaning*)>
+<!ELEMENT reading (#PCDATA)>
+	<!-- 
+	The reading element contains the reading or pronunciation
+	of the kanji.
+	-->
+<!ATTLIST reading r_type CDATA #REQUIRED>
+	<!-- 
+	The r_type attribute defines the type of reading in the reading
+	element. The current values are:
+	  pinyin - the modern PinYin romanization of the Chinese reading 
+	  	of the kanji. The tones are represented by a concluding 
+	  	digit. [Y]
+	  korean_r - the romanized form of the Korean reading(s) of the 
+	  	kanji.  The readings are in the (Republic of Korea) Ministry 
+	  	of Education style of romanization. [W]
+	  korean_h - the Korean reading(s) of the kanji in hangul.
+	  ja_on - the "on" Japanese reading of the kanji, in katakana. A
+	  	second attribute r_status, if present, will indicate with
+	  	a value of "jy" whether the reading is approved for a
+	  	"Jouyou kanji".
+	  ja_kun - the "kun" Japanese reading of the kanji, in hiragana. 
+	  	Where relevant the okurigana is also included separated by a 
+	  	".". Readings associated with prefixes and suffixes are 
+	  	marked with a "-". A second attribute r_status, if present, 
+	  	will indicate with a value of "jy" whether the reading is 
+	  	approved for a "Jouyou kanji".
+	-->
+<!ATTLIST reading r_status CDATA #IMPLIED>
+	<!-- 
+	See under ja_on and ja_kun above.
+	-->
+<!ELEMENT meaning (#PCDATA)>
+	<!-- 
+	The meaning associated with the kanji.
+	-->
+<!ATTLIST meaning m_lang CDATA #IMPLIED>
+	<!-- 
+	The m_lang attribute defines the target language of the meaning. It 
+	will be coded using the two-letter language code from the ISO 639 
+	standard. When absent, the value "en" (i.e. English) is implied. [{}]
+	-->
+] >
+<kanjidic2>
+</kanjidic2>