mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2024-12-23 17:33:50 +03:00
found and fixed 2 problems in the internal subset scanning code affecting
* parser.c: found and fixed 2 problems in the internal subset scanning code affecting the push parser (and the reader), fixes #165126 * test/intsubset2.xml result//intsubset2.xml*: added the test case to the regression tests. Daniel
This commit is contained in:
parent
cee2b3a5f1
commit
8f8a9dd7f1
@ -1,3 +1,10 @@
|
||||
Tue Jan 25 22:39:33 CET 2005 Daniel Veillard <daniel@veillard.com>
|
||||
|
||||
* parser.c: found and fixed 2 problems in the internal subset scanning
|
||||
code affecting the push parser (and the reader), fixes #165126
|
||||
* test/intsubset2.xml result//intsubset2.xml*: added the test case
|
||||
to the regression tests.
|
||||
|
||||
Tue Jan 25 01:20:11 CET 2005 Daniel Veillard <daniel@veillard.com>
|
||||
|
||||
* testdso.c xmlregexp.c: warning patches from Peter Breitenlohner
|
||||
|
34
parser.c
34
parser.c
@ -9861,8 +9861,12 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found)
|
||||
break;
|
||||
if (!found) {
|
||||
#if 0
|
||||
fprintf(stderr, "unfinished comment\n");
|
||||
#endif
|
||||
break; /* for */
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@ -9875,6 +9879,10 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
|
||||
continue;
|
||||
}
|
||||
if (buf[base] == ']') {
|
||||
#if 0
|
||||
fprintf(stderr, "%c%c%c%c: ", buf[base],
|
||||
buf[base + 1], buf[base + 2], buf[base + 3]);
|
||||
#endif
|
||||
if ((unsigned int) base +1 >=
|
||||
ctxt->input->buf->buffer->use)
|
||||
break;
|
||||
@ -9883,20 +9891,34 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
|
||||
base++;
|
||||
continue;
|
||||
}
|
||||
for (i = 0;
|
||||
for (i = 1;
|
||||
(unsigned int) base + i < ctxt->input->buf->buffer->use;
|
||||
i++) {
|
||||
if (buf[base + i] == '>')
|
||||
if (buf[base + i] == '>') {
|
||||
#if 0
|
||||
fprintf(stderr, "found\n");
|
||||
#endif
|
||||
goto found_end_int_subset;
|
||||
}
|
||||
if (!IS_BLANK_CH(buf[base + i])) {
|
||||
#if 0
|
||||
fprintf(stderr, "not found\n");
|
||||
#endif
|
||||
goto not_end_of_int_subset;
|
||||
}
|
||||
}
|
||||
#if 0
|
||||
fprintf(stderr, "end of stream\n");
|
||||
#endif
|
||||
break;
|
||||
|
||||
}
|
||||
not_end_of_int_subset:
|
||||
continue; /* for */
|
||||
}
|
||||
/*
|
||||
* We didn't found the end of the Internal subset
|
||||
*/
|
||||
if (quote == 0)
|
||||
ctxt->checkIndex = base;
|
||||
#ifdef DEBUG_PUSH
|
||||
if (next == 0)
|
||||
xmlGenericError(xmlGenericErrorContext,
|
||||
|
250
result/intsubset2.xml
Normal file
250
result/intsubset2.xml
Normal file
@ -0,0 +1,250 @@
|
||||
<?xml version="1.0"?>
|
||||
<!DOCTYPE kanjidic2 [
|
||||
<!-- Version 1.3
|
||||
This is the DTD of the XML-format kanji file combining information from
|
||||
the KANJIDIC and KANJD212 files. It is intended to be largely self-
|
||||
documenting, with each field being accompanied by an explanatory
|
||||
comment.
|
||||
|
||||
The file covers the following kanji:
|
||||
(a) the 6,355 kanji from JIS X 0208;
|
||||
(b) the 5,801 kanji from JIS X 0212;
|
||||
(c) the 3,625 kanji from JIS X 0213 as follows:
|
||||
(i) the 2,741 kanji which are also in JIS X 0212 have
|
||||
JIS X 0213 code-points (kuten) added to the existing entry;
|
||||
(ii) the 884 "new" kanji have new entries.
|
||||
|
||||
At the end of the explanation for a number of fields there is a tag
|
||||
with the format [N]. This indicates the leading letter(s) of the
|
||||
equivalent field in the KANJIDIC and KANJD212 files.
|
||||
|
||||
The KANJIDIC documentation should also be read for additional
|
||||
information about the information in the file.
|
||||
--><!ELEMENT kanjidic2 (header , character*)>
|
||||
<!ELEMENT header (file_version , database_version , date_of_creation)>
|
||||
<!--
|
||||
The single header element will contain identification information
|
||||
about the version of the file
|
||||
--><!ELEMENT file_version (#PCDATA)>
|
||||
<!--
|
||||
This field denotes the version of kanjidic2 structure, as more
|
||||
than one version may exist.
|
||||
--><!ELEMENT database_version (#PCDATA)>
|
||||
<!--
|
||||
The version of the file, in the format YYYY-NN, where NN will be
|
||||
a number starting with 01 for the first version released in a
|
||||
calendar year, then increasing for each version in that year.
|
||||
--><!ELEMENT date_of_creation (#PCDATA)>
|
||||
<!--
|
||||
The date the file was created in international format (YYYY-MM-DD).
|
||||
--><!ELEMENT character (literal , codepoint , radical , misc , dic_number? , query_code? , reading_meaning? , nanori?)*>
|
||||
<!ELEMENT literal (#PCDATA)>
|
||||
<!--
|
||||
The character itself in UTF8 coding.
|
||||
--><!ELEMENT codepoint (cp_value)+>
|
||||
<!--
|
||||
The codepoint element states the code of the character in the various
|
||||
character set standards.
|
||||
--><!ELEMENT cp_value (#PCDATA)>
|
||||
<!--
|
||||
The cp_value contains the codepoint of the character in a particular
|
||||
standard. The standard will be identified in the cp_type attribute.
|
||||
--><!ATTLIST cp_value cp_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The cp_type attribute states the coding standard applying to the
|
||||
element. The values assigned so far are:
|
||||
jis208 - JIS X 0208-1997 - kuten coding (nn-nn)
|
||||
jis212 - JIS X 0212-1990 - kuten coding (nn-nn)
|
||||
jis213 - JIS X 0213-2000 - kuten coding (p-nn-nn)
|
||||
ucs - Unicode 4.0 - hex coding (4 or 5 hexadecimal digits)
|
||||
--><!ELEMENT radical (rad_value)+>
|
||||
<!ELEMENT rad_value (#PCDATA)>
|
||||
<!--
|
||||
The radical number, in the range 1 to 214. The particular
|
||||
classification type is stated in the rad_type attribute.
|
||||
--><!ATTLIST rad_value rad_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The rad_type attribute states the type of radical classification.
|
||||
classical - as recorded in the KangXi Zidian.
|
||||
nelson - as used in the Nelson "Modern Japanese-English
|
||||
Character Dictionary" (i.e. the Classic, not the New Nelson).
|
||||
This will only be used where Nelson reclassified the kanji.
|
||||
--><!ELEMENT misc (grade? , stroke_count+ , variant* , freq* , rad_name*)>
|
||||
<!ELEMENT grade (#PCDATA)>
|
||||
<!--
|
||||
The Jouyou Kanji grade level. 1 through 6 indicate the grade in which
|
||||
the kanji is taught in Japanese schools. 8 indicates it is one of the
|
||||
remaining Jouyou Kanji to be learned in junior high school, and 9
|
||||
indicates it is a Jinmeiyou (for use in names) kanji. [G]
|
||||
--><!ELEMENT stroke_count (#PCDATA)>
|
||||
<!--
|
||||
The stroke count of the kanji, including the radical. If more than
|
||||
one, the first is considered the accepted count, while subsequent ones
|
||||
are common miscounts. (See Appendix E. of the KANJIDIC documentation
|
||||
for some of the rules applied when counting strokes in some of the
|
||||
radicals.) [S]
|
||||
--><!ELEMENT variant (#PCDATA)>
|
||||
<!--
|
||||
A cross-reference code to another kanji, usually regarded as a variant.
|
||||
The type of cross-reference is given in the var_type attribute.
|
||||
--><!ATTLIST variant var_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The var_type attribute indicates the type of variant code. The current
|
||||
values are:
|
||||
jis208 - in JIS X 0208 - kuten coding
|
||||
jis212 - in JIS X 0212 - kuten coding
|
||||
jis213 - in JIS X 0213 - kuten coding
|
||||
deroo - De Roo number - numeric
|
||||
njecd - Halpern NJECD index number - numeric
|
||||
s_h - The Kanji Dictionary (Spahn & Hadamitzky) - descriptor
|
||||
nelson - "Classic" Nelson - numeric
|
||||
oneill - Japanese Names (O'Neill) - numeric
|
||||
--><!ELEMENT freq (#PCDATA)>
|
||||
<!--
|
||||
A frequency-of-use ranking. The 2,500 most-used characters have a
|
||||
ranking; those characters that lack this field are not ranked. The
|
||||
frequency is a number from 1 to 2,500 that expresses the relative
|
||||
frequency of occurrence of a character in modern Japanese. This is
|
||||
based on a survey in newspapers, so it is biassed towards kanji
|
||||
used in newspaper articles. The discrimination between the less
|
||||
frequently used kanji is not strong.
|
||||
--><!ELEMENT rad_name (#PCDATA)>
|
||||
<!--
|
||||
When the kanji is itself a radical and has a name, this element
|
||||
contains the name (in hiragana.) [T2]
|
||||
--><!ELEMENT dic_number (dic_ref)+>
|
||||
<!--
|
||||
This element contains the index numbers and similar unstructured
|
||||
information such as page numbers in a number of published dictionaries,
|
||||
and instructional books on kanji.
|
||||
--><!ELEMENT dic_ref (#PCDATA)>
|
||||
<!--
|
||||
Each dic_ref contains an index number. The particular dictionary,
|
||||
etc. is defined by the dr_type attribute.
|
||||
--><!ATTLIST dic_ref dr_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The dr_type defines the dictionary or reference book, etc. to which
|
||||
dic_ref element applies. The initial allocation is:
|
||||
nelson_c - "Modern Reader's Japanese-English Character Dictionary",
|
||||
edited by Andrew Nelson (now published as the "Classic"
|
||||
Nelson).
|
||||
nelson_n - "The New Nelson Japanese-English Character Dictionary",
|
||||
edited by John Haig.
|
||||
halpern_njecd - "New Japanese-English Character Dictionary",
|
||||
edited by Jack Halpern.
|
||||
halpern_kkld - "Kanji Learners Dictionary" (Kodansha) edited by
|
||||
Jack Halpern.
|
||||
heisig - "Remembering The Kanji" by James Heisig.
|
||||
gakken - "A New Dictionary of Kanji Usage" (Gakken)
|
||||
oneill_names - "Japanese Names", by P.G. O'Neill.
|
||||
oneill_kk - "Essential Kanji" by P.G. O'Neill.
|
||||
moro - "Daikanwajiten" compiled by Morohashi. For some kanji two
|
||||
additional attributes are used: m_vol: the volume of the
|
||||
dictionary in which the kanji is found, and m_page: the page
|
||||
number in the volume.
|
||||
henshall - "A Guide To Remembering Japanese Characters" by
|
||||
Kenneth G. Henshall.
|
||||
sh_kk - "Kanji and Kana" by Spahn and Hadamitzky.
|
||||
sakade - "A Guide To Reading and Writing Japanese" edited by
|
||||
Florence Sakade.
|
||||
henshall3 - "A Guide To Reading and Writing Japanese" 3rd
|
||||
edition, edited by Henshall, Seeley and De Groot.
|
||||
tutt_cards - Tuttle Kanji Cards, compiled by Alexander Kask.
|
||||
crowley - "The Kanji Way to Japanese Language Power" by
|
||||
Dale Crowley.
|
||||
kanji_in_context - "Kanji in Context" by Nishiguchi and Kono.
|
||||
busy_people - "Japanese For Busy People" vols I-III, published
|
||||
by the AJLT. The codes are the volume.chapter.
|
||||
kodansha_compact - the "Kodansha Compact Kanji Guide".
|
||||
--><!ATTLIST dic_ref m_vol CDATA #IMPLIED>
|
||||
<!--
|
||||
See above under "moro".
|
||||
--><!ATTLIST dic_ref m_page CDATA #IMPLIED>
|
||||
<!--
|
||||
See above under "moro".
|
||||
--><!ELEMENT query_code (q_code)+>
|
||||
<!--
|
||||
These codes contain information relating to the glyph, and can be used
|
||||
for finding a required kanji. The type of code is defined by the
|
||||
qc_type attribute.
|
||||
--><!ELEMENT q_code (#PCDATA)>
|
||||
<!--
|
||||
The q_code contains the actual query-code value, according to the
|
||||
qc_type attribute.
|
||||
--><!ATTLIST q_code qc_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The q_code attribute defines the type of query code. The current values
|
||||
are:
|
||||
skip - Halpern's SKIP (System of Kanji Indexing by Patterns)
|
||||
code. The format is n-nn-nn. See the KANJIDIC documentation
|
||||
for a description of the code and restrictions on the
|
||||
commercial use of this data. [P]
|
||||
|
||||
sh_desc - the descriptor codes for The Kanji Dictionary (Tuttle
|
||||
1996) by Spahn and Hadamitzky. They are in the form nxnn.n,
|
||||
e.g. 3k11.2, where the kanji has 3 strokes in the
|
||||
identifying radical, it is radical "k" in the SH
|
||||
classification system, there are 11 other strokes, and it is
|
||||
the 2nd kanji in the 3k11 sequence. (I am very grateful to
|
||||
Mark Spahn for providing the list of these descriptor codes
|
||||
for the kanji in this file.) [I]
|
||||
four_corner - the "Four Corner" code for the kanji. This is a code
|
||||
invented by Wang Chen in 1928. See the KANJIDIC documentation
|
||||
for an overview of the Four Corner System. [Q]
|
||||
|
||||
deroo - the codes developed by the late Father Joseph De Roo, and
|
||||
published in his book "2001 Kanji" (Bojinsha). Fr De Roo
|
||||
gave his permission for these codes to be included. [DR]
|
||||
misclass - a possible misclassification of the kanji according
|
||||
to one of the code types. (See the "Z" codes in the KANJIDIC
|
||||
documentation for more details.)
|
||||
|
||||
--><!ELEMENT reading_meaning (rmgroup* , nanori*)>
|
||||
<!--
|
||||
The readings for the kanji in several languages, and the meanings, also
|
||||
in several languages. The readings and meanings are grouped to enable
|
||||
the handling of the situation where the meaning is differentiated by
|
||||
reading. [T1]
|
||||
--><!ELEMENT nanori (#PCDATA)>
|
||||
<!--
|
||||
Japanese readings that are now only associated with names.
|
||||
--><!ELEMENT rmgroup (reading* , meaning*)>
|
||||
<!ELEMENT reading (#PCDATA)>
|
||||
<!--
|
||||
The reading element contains the reading or pronunciation
|
||||
of the kanji.
|
||||
--><!ATTLIST reading r_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The r_type attribute defines the type of reading in the reading
|
||||
element. The current values are:
|
||||
pinyin - the modern PinYin romanization of the Chinese reading
|
||||
of the kanji. The tones are represented by a concluding
|
||||
digit. [Y]
|
||||
korean_r - the romanized form of the Korean reading(s) of the
|
||||
kanji. The readings are in the (Republic of Korea) Ministry
|
||||
of Education style of romanization. [W]
|
||||
korean_h - the Korean reading(s) of the kanji in hangul.
|
||||
ja_on - the "on" Japanese reading of the kanji, in katakana. A
|
||||
second attribute r_status, if present, will indicate with
|
||||
a value of "jy" whether the reading is approved for a
|
||||
"Jouyou kanji".
|
||||
ja_kun - the "kun" Japanese reading of the kanji, in hiragana.
|
||||
Where relevant the okurigana is also included separated by a
|
||||
".". Readings associated with prefixes and suffixes are
|
||||
marked with a "-". A second attribute r_status, if present,
|
||||
will indicate with a value of "jy" whether the reading is
|
||||
approved for a "Jouyou kanji".
|
||||
--><!ATTLIST reading r_status CDATA #IMPLIED>
|
||||
<!--
|
||||
See under ja_on and ja_kun above.
|
||||
--><!ELEMENT meaning (#PCDATA)>
|
||||
<!--
|
||||
The meaning associated with the kanji.
|
||||
--><!ATTLIST meaning m_lang CDATA #IMPLIED>
|
||||
<!--
|
||||
The m_lang attribute defines the target language of the meaning. It
|
||||
will be coded using the two-letter language code from the ISO 639
|
||||
standard. When absent, the value "en" (i.e. English) is implied. [{}]
|
||||
-->]>
|
||||
<kanjidic2>
|
||||
</kanjidic2>
|
5
result/intsubset2.xml.rde
Normal file
5
result/intsubset2.xml.rde
Normal file
@ -0,0 +1,5 @@
|
||||
0 10 kanjidic2 0 0
|
||||
0 1 kanjidic2 0 0
|
||||
1 14 #text 0 1
|
||||
|
||||
0 15 kanjidic2 0 0
|
5
result/intsubset2.xml.rdr
Normal file
5
result/intsubset2.xml.rdr
Normal file
@ -0,0 +1,5 @@
|
||||
0 10 kanjidic2 0 0
|
||||
0 1 kanjidic2 0 0
|
||||
1 14 #text 0 1
|
||||
|
||||
0 15 kanjidic2 0 0
|
286
result/intsubset2.xml.sax
Normal file
286
result/intsubset2.xml.sax
Normal file
@ -0,0 +1,286 @@
|
||||
SAX.setDocumentLocator()
|
||||
SAX.startDocument()
|
||||
SAX.internalSubset(kanjidic2, , )
|
||||
SAX.comment( Version 1.3
|
||||
This is the DTD of the XML-format kanji file combining information from
|
||||
the KANJIDIC and KANJD212 files. It is intended to be largely self-
|
||||
documenting, with each field being accompanied by an explanatory
|
||||
comment.
|
||||
|
||||
The file covers the following kanji:
|
||||
(a) the 6,355 kanji from JIS X 0208;
|
||||
(b) the 5,801 kanji from JIS X 0212;
|
||||
(c) the 3,625 kanji from JIS X 0213 as follows:
|
||||
(i) the 2,741 kanji which are also in JIS X 0212 have
|
||||
JIS X 0213 code-points (kuten) added to the existing entry;
|
||||
(ii) the 884 "new" kanji have new entries.
|
||||
|
||||
At the end of the explanation for a number of fields there is a tag
|
||||
with the format [N]. This indicates the leading letter(s) of the
|
||||
equivalent field in the KANJIDIC and KANJD212 files.
|
||||
|
||||
The KANJIDIC documentation should also be read for additional
|
||||
information about the information in the file.
|
||||
)
|
||||
SAX.elementDecl(kanjidic2, 4, ...)
|
||||
SAX.elementDecl(header, 4, ...)
|
||||
SAX.comment(
|
||||
The single header element will contain identification information
|
||||
about the version of the file
|
||||
)
|
||||
SAX.elementDecl(file_version, 3, ...)
|
||||
SAX.comment(
|
||||
This field denotes the version of kanjidic2 structure, as more
|
||||
than one version may exist.
|
||||
)
|
||||
SAX.elementDecl(database_version, 3, ...)
|
||||
SAX.comment(
|
||||
The version of the file, in the format YYYY-NN, where NN will be
|
||||
a number starting with 01 for the first version released in a
|
||||
calendar year, then increasing for each version in that year.
|
||||
)
|
||||
SAX.elementDecl(date_of_creation, 3, ...)
|
||||
SAX.comment(
|
||||
The date the file was created in international format (YYYY-MM-DD).
|
||||
)
|
||||
SAX.elementDecl(character, 4, ...)
|
||||
SAX.elementDecl(literal, 3, ...)
|
||||
SAX.comment(
|
||||
The character itself in UTF8 coding.
|
||||
)
|
||||
SAX.elementDecl(codepoint, 4, ...)
|
||||
SAX.comment(
|
||||
The codepoint element states the code of the character in the various
|
||||
character set standards.
|
||||
)
|
||||
SAX.elementDecl(cp_value, 3, ...)
|
||||
SAX.comment(
|
||||
The cp_value contains the codepoint of the character in a particular
|
||||
standard. The standard will be identified in the cp_type attribute.
|
||||
)
|
||||
SAX.attributeDecl(cp_value, cp_type, 1, 2, NULL, ...)
|
||||
SAX.comment(
|
||||
The cp_type attribute states the coding standard applying to the
|
||||
element. The values assigned so far are:
|
||||
jis208 - JIS X 0208-1997 - kuten coding (nn-nn)
|
||||
jis212 - JIS X 0212-1990 - kuten coding (nn-nn)
|
||||
jis213 - JIS X 0213-2000 - kuten coding (p-nn-nn)
|
||||
ucs - Unicode 4.0 - hex coding (4 or 5 hexadecimal digits)
|
||||
)
|
||||
SAX.elementDecl(radical, 4, ...)
|
||||
SAX.elementDecl(rad_value, 3, ...)
|
||||
SAX.comment(
|
||||
The radical number, in the range 1 to 214. The particular
|
||||
classification type is stated in the rad_type attribute.
|
||||
)
|
||||
SAX.attributeDecl(rad_value, rad_type, 1, 2, NULL, ...)
|
||||
SAX.comment(
|
||||
The rad_type attribute states the type of radical classification.
|
||||
classical - as recorded in the KangXi Zidian.
|
||||
nelson - as used in the Nelson "Modern Japanese-English
|
||||
Character Dictionary" (i.e. the Classic, not the New Nelson).
|
||||
This will only be used where Nelson reclassified the kanji.
|
||||
)
|
||||
SAX.elementDecl(misc, 4, ...)
|
||||
SAX.elementDecl(grade, 3, ...)
|
||||
SAX.comment(
|
||||
The Jouyou Kanji grade level. 1 through 6 indicate the grade in which
|
||||
the kanji is taught in Japanese schools. 8 indicates it is one of the
|
||||
remaining Jouyou Kanji to be learned in junior high school, and 9
|
||||
indicates it is a Jinmeiyou (for use in names) kanji. [G]
|
||||
)
|
||||
SAX.elementDecl(stroke_count, 3, ...)
|
||||
SAX.comment(
|
||||
The stroke count of the kanji, including the radical. If more than
|
||||
one, the first is considered the accepted count, while subsequent ones
|
||||
are common miscounts. (See Appendix E. of the KANJIDIC documentation
|
||||
for some of the rules applied when counting strokes in some of the
|
||||
radicals.) [S]
|
||||
)
|
||||
SAX.elementDecl(variant, 3, ...)
|
||||
SAX.comment(
|
||||
A cross-reference code to another kanji, usually regarded as a variant.
|
||||
The type of cross-reference is given in the var_type attribute.
|
||||
)
|
||||
SAX.attributeDecl(variant, var_type, 1, 2, NULL, ...)
|
||||
SAX.comment(
|
||||
The var_type attribute indicates the type of variant code. The current
|
||||
values are:
|
||||
jis208 - in JIS X 0208 - kuten coding
|
||||
jis212 - in JIS X 0212 - kuten coding
|
||||
jis213 - in JIS X 0213 - kuten coding
|
||||
deroo - De Roo number - numeric
|
||||
njecd - Halpern NJECD index number - numeric
|
||||
s_h - The Kanji Dictionary (Spahn & Hadamitzky) - descriptor
|
||||
nelson - "Classic" Nelson - numeric
|
||||
oneill - Japanese Names (O'Neill) - numeric
|
||||
)
|
||||
SAX.elementDecl(freq, 3, ...)
|
||||
SAX.comment(
|
||||
A frequency-of-use ranking. The 2,500 most-used characters have a
|
||||
ranking; those characters that lack this field are not ranked. The
|
||||
frequency is a number from 1 to 2,500 that expresses the relative
|
||||
frequency of occurrence of a character in modern Japanese. This is
|
||||
based on a survey in newspapers, so it is biassed towards kanji
|
||||
used in newspaper articles. The discrimination between the less
|
||||
frequently used kanji is not strong.
|
||||
)
|
||||
SAX.elementDecl(rad_name, 3, ...)
|
||||
SAX.comment(
|
||||
When the kanji is itself a radical and has a name, this element
|
||||
contains the name (in hiragana.) [T2]
|
||||
)
|
||||
SAX.elementDecl(dic_number, 4, ...)
|
||||
SAX.comment(
|
||||
This element contains the index numbers and similar unstructured
|
||||
information such as page numbers in a number of published dictionaries,
|
||||
and instructional books on kanji.
|
||||
)
|
||||
SAX.elementDecl(dic_ref, 3, ...)
|
||||
SAX.comment(
|
||||
Each dic_ref contains an index number. The particular dictionary,
|
||||
etc. is defined by the dr_type attribute.
|
||||
)
|
||||
SAX.attributeDecl(dic_ref, dr_type, 1, 2, NULL, ...)
|
||||
SAX.comment(
|
||||
The dr_type defines the dictionary or reference book, etc. to which
|
||||
dic_ref element applies. The initial allocation is:
|
||||
nelson_c - "Modern Reader's Japanese-English Character Dictionary",
|
||||
edited by Andrew Nelson (now published as the "Classic"
|
||||
Nelson).
|
||||
nelson_n - "The New Nelson Japanese-English Character Dictionary",
|
||||
edited by John Haig.
|
||||
halpern_njecd - "New Japanese-English Character Dictionary",
|
||||
edited by Jack Halpern.
|
||||
halpern_kkld - "Kanji Learners Dictionary" (Kodansha) edited by
|
||||
Jack Halpern.
|
||||
heisig - "Remembering The Kanji" by James Heisig.
|
||||
gakken - "A New Dictionary of Kanji Usage" (Gakken)
|
||||
oneill_names - "Japanese Names", by P.G. O'Neill.
|
||||
oneill_kk - "Essential Kanji" by P.G. O'Neill.
|
||||
moro - "Daikanwajiten" compiled by Morohashi. For some kanji two
|
||||
additional attributes are used: m_vol: the volume of the
|
||||
dictionary in which the kanji is found, and m_page: the page
|
||||
number in the volume.
|
||||
henshall - "A Guide To Remembering Japanese Characters" by
|
||||
Kenneth G. Henshall.
|
||||
sh_kk - "Kanji and Kana" by Spahn and Hadamitzky.
|
||||
sakade - "A Guide To Reading and Writing Japanese" edited by
|
||||
Florence Sakade.
|
||||
henshall3 - "A Guide To Reading and Writing Japanese" 3rd
|
||||
edition, edited by Henshall, Seeley and De Groot.
|
||||
tutt_cards - Tuttle Kanji Cards, compiled by Alexander Kask.
|
||||
crowley - "The Kanji Way to Japanese Language Power" by
|
||||
Dale Crowley.
|
||||
kanji_in_context - "Kanji in Context" by Nishiguchi and Kono.
|
||||
busy_people - "Japanese For Busy People" vols I-III, published
|
||||
by the AJLT. The codes are the volume.chapter.
|
||||
kodansha_compact - the "Kodansha Compact Kanji Guide".
|
||||
)
|
||||
SAX.attributeDecl(dic_ref, m_vol, 1, 3, NULL, ...)
|
||||
SAX.comment(
|
||||
See above under "moro".
|
||||
)
|
||||
SAX.attributeDecl(dic_ref, m_page, 1, 3, NULL, ...)
|
||||
SAX.comment(
|
||||
See above under "moro".
|
||||
)
|
||||
SAX.elementDecl(query_code, 4, ...)
|
||||
SAX.comment(
|
||||
These codes contain information relating to the glyph, and can be used
|
||||
for finding a required kanji. The type of code is defined by the
|
||||
qc_type attribute.
|
||||
)
|
||||
SAX.elementDecl(q_code, 3, ...)
|
||||
SAX.comment(
|
||||
The q_code contains the actual query-code value, according to the
|
||||
qc_type attribute.
|
||||
)
|
||||
SAX.attributeDecl(q_code, qc_type, 1, 2, NULL, ...)
|
||||
SAX.comment(
|
||||
The q_code attribute defines the type of query code. The current values
|
||||
are:
|
||||
skip - Halpern's SKIP (System of Kanji Indexing by Patterns)
|
||||
code. The format is n-nn-nn. See the KANJIDIC documentation
|
||||
for a description of the code and restrictions on the
|
||||
commercial use of this data. [P]
|
||||
|
||||
sh_desc - the descriptor codes for The Kanji Dictionary (Tuttle
|
||||
1996) by Spahn and Hadamitzky. They are in the form nxnn.n,
|
||||
e.g. 3k11.2, where the kanji has 3 strokes in the
|
||||
identifying radical, it is radical "k" in the SH
|
||||
classification system, there are 11 other strokes, and it is
|
||||
the 2nd kanji in the 3k11 sequence. (I am very grateful to
|
||||
Mark Spahn for providing the list of these descriptor codes
|
||||
for the kanji in this file.) [I]
|
||||
four_corner - the "Four Corner" code for the kanji. This is a code
|
||||
invented by Wang Chen in 1928. See the KANJIDIC documentation
|
||||
for an overview of the Four Corner System. [Q]
|
||||
|
||||
deroo - the codes developed by the late Father Joseph De Roo, and
|
||||
published in his book "2001 Kanji" (Bojinsha). Fr De Roo
|
||||
gave his permission for these codes to be included. [DR]
|
||||
misclass - a possible misclassification of the kanji according
|
||||
to one of the code types. (See the "Z" codes in the KANJIDIC
|
||||
documentation for more details.)
|
||||
|
||||
)
|
||||
SAX.elementDecl(reading_meaning, 4, ...)
|
||||
SAX.comment(
|
||||
The readings for the kanji in several languages, and the meanings, also
|
||||
in several languages. The readings and meanings are grouped to enable
|
||||
the handling of the situation where the meaning is differentiated by
|
||||
reading. [T1]
|
||||
)
|
||||
SAX.elementDecl(nanori, 3, ...)
|
||||
SAX.comment(
|
||||
Japanese readings that are now only associated with names.
|
||||
)
|
||||
SAX.elementDecl(rmgroup, 4, ...)
|
||||
SAX.elementDecl(reading, 3, ...)
|
||||
SAX.comment(
|
||||
The reading element contains the reading or pronunciation
|
||||
of the kanji.
|
||||
)
|
||||
SAX.attributeDecl(reading, r_type, 1, 2, NULL, ...)
|
||||
SAX.comment(
|
||||
The r_type attribute defines the type of reading in the reading
|
||||
element. The current values are:
|
||||
pinyin - the modern PinYin romanization of the Chinese reading
|
||||
of the kanji. The tones are represented by a concluding
|
||||
digit. [Y]
|
||||
korean_r - the romanized form of the Korean reading(s) of the
|
||||
kanji. The readings are in the (Republic of Korea) Ministry
|
||||
of Education style of romanization. [W]
|
||||
korean_h - the Korean reading(s) of the kanji in hangul.
|
||||
ja_on - the "on" Japanese reading of the kanji, in katakana. A
|
||||
second attribute r_status, if present, will indicate with
|
||||
a value of "jy" whether the reading is approved for a
|
||||
"Jouyou kanji".
|
||||
ja_kun - the "kun" Japanese reading of the kanji, in hiragana.
|
||||
Where relevant the okurigana is also included separated by a
|
||||
".". Readings associated with prefixes and suffixes are
|
||||
marked with a "-". A second attribute r_status, if present,
|
||||
will indicate with a value of "jy" whether the reading is
|
||||
approved for a "Jouyou kanji".
|
||||
)
|
||||
SAX.attributeDecl(reading, r_status, 1, 3, NULL, ...)
|
||||
SAX.comment(
|
||||
See under ja_on and ja_kun above.
|
||||
)
|
||||
SAX.elementDecl(meaning, 3, ...)
|
||||
SAX.comment(
|
||||
The meaning associated with the kanji.
|
||||
)
|
||||
SAX.attributeDecl(meaning, m_lang, 1, 3, NULL, ...)
|
||||
SAX.comment(
|
||||
The m_lang attribute defines the target language of the meaning. It
|
||||
will be coded using the two-letter language code from the ISO 639
|
||||
standard. When absent, the value "en" (i.e. English) is implied. [{}]
|
||||
)
|
||||
SAX.externalSubset(kanjidic2, , )
|
||||
SAX.startElement(kanjidic2)
|
||||
SAX.characters(
|
||||
, 1)
|
||||
SAX.endElement(kanjidic2)
|
||||
SAX.endDocument()
|
250
result/noent/intsubset2.xml
Normal file
250
result/noent/intsubset2.xml
Normal file
@ -0,0 +1,250 @@
|
||||
<?xml version="1.0"?>
|
||||
<!DOCTYPE kanjidic2 [
|
||||
<!-- Version 1.3
|
||||
This is the DTD of the XML-format kanji file combining information from
|
||||
the KANJIDIC and KANJD212 files. It is intended to be largely self-
|
||||
documenting, with each field being accompanied by an explanatory
|
||||
comment.
|
||||
|
||||
The file covers the following kanji:
|
||||
(a) the 6,355 kanji from JIS X 0208;
|
||||
(b) the 5,801 kanji from JIS X 0212;
|
||||
(c) the 3,625 kanji from JIS X 0213 as follows:
|
||||
(i) the 2,741 kanji which are also in JIS X 0212 have
|
||||
JIS X 0213 code-points (kuten) added to the existing entry;
|
||||
(ii) the 884 "new" kanji have new entries.
|
||||
|
||||
At the end of the explanation for a number of fields there is a tag
|
||||
with the format [N]. This indicates the leading letter(s) of the
|
||||
equivalent field in the KANJIDIC and KANJD212 files.
|
||||
|
||||
The KANJIDIC documentation should also be read for additional
|
||||
information about the information in the file.
|
||||
--><!ELEMENT kanjidic2 (header , character*)>
|
||||
<!ELEMENT header (file_version , database_version , date_of_creation)>
|
||||
<!--
|
||||
The single header element will contain identification information
|
||||
about the version of the file
|
||||
--><!ELEMENT file_version (#PCDATA)>
|
||||
<!--
|
||||
This field denotes the version of kanjidic2 structure, as more
|
||||
than one version may exist.
|
||||
--><!ELEMENT database_version (#PCDATA)>
|
||||
<!--
|
||||
The version of the file, in the format YYYY-NN, where NN will be
|
||||
a number starting with 01 for the first version released in a
|
||||
calendar year, then increasing for each version in that year.
|
||||
--><!ELEMENT date_of_creation (#PCDATA)>
|
||||
<!--
|
||||
The date the file was created in international format (YYYY-MM-DD).
|
||||
--><!ELEMENT character (literal , codepoint , radical , misc , dic_number? , query_code? , reading_meaning? , nanori?)*>
|
||||
<!ELEMENT literal (#PCDATA)>
|
||||
<!--
|
||||
The character itself in UTF8 coding.
|
||||
--><!ELEMENT codepoint (cp_value)+>
|
||||
<!--
|
||||
The codepoint element states the code of the character in the various
|
||||
character set standards.
|
||||
--><!ELEMENT cp_value (#PCDATA)>
|
||||
<!--
|
||||
The cp_value contains the codepoint of the character in a particular
|
||||
standard. The standard will be identified in the cp_type attribute.
|
||||
--><!ATTLIST cp_value cp_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The cp_type attribute states the coding standard applying to the
|
||||
element. The values assigned so far are:
|
||||
jis208 - JIS X 0208-1997 - kuten coding (nn-nn)
|
||||
jis212 - JIS X 0212-1990 - kuten coding (nn-nn)
|
||||
jis213 - JIS X 0213-2000 - kuten coding (p-nn-nn)
|
||||
ucs - Unicode 4.0 - hex coding (4 or 5 hexadecimal digits)
|
||||
--><!ELEMENT radical (rad_value)+>
|
||||
<!ELEMENT rad_value (#PCDATA)>
|
||||
<!--
|
||||
The radical number, in the range 1 to 214. The particular
|
||||
classification type is stated in the rad_type attribute.
|
||||
--><!ATTLIST rad_value rad_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The rad_type attribute states the type of radical classification.
|
||||
classical - as recorded in the KangXi Zidian.
|
||||
nelson - as used in the Nelson "Modern Japanese-English
|
||||
Character Dictionary" (i.e. the Classic, not the New Nelson).
|
||||
This will only be used where Nelson reclassified the kanji.
|
||||
--><!ELEMENT misc (grade? , stroke_count+ , variant* , freq* , rad_name*)>
|
||||
<!ELEMENT grade (#PCDATA)>
|
||||
<!--
|
||||
The Jouyou Kanji grade level. 1 through 6 indicate the grade in which
|
||||
the kanji is taught in Japanese schools. 8 indicates it is one of the
|
||||
remaining Jouyou Kanji to be learned in junior high school, and 9
|
||||
indicates it is a Jinmeiyou (for use in names) kanji. [G]
|
||||
--><!ELEMENT stroke_count (#PCDATA)>
|
||||
<!--
|
||||
The stroke count of the kanji, including the radical. If more than
|
||||
one, the first is considered the accepted count, while subsequent ones
|
||||
are common miscounts. (See Appendix E. of the KANJIDIC documentation
|
||||
for some of the rules applied when counting strokes in some of the
|
||||
radicals.) [S]
|
||||
--><!ELEMENT variant (#PCDATA)>
|
||||
<!--
|
||||
A cross-reference code to another kanji, usually regarded as a variant.
|
||||
The type of cross-reference is given in the var_type attribute.
|
||||
--><!ATTLIST variant var_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The var_type attribute indicates the type of variant code. The current
|
||||
values are:
|
||||
jis208 - in JIS X 0208 - kuten coding
|
||||
jis212 - in JIS X 0212 - kuten coding
|
||||
jis213 - in JIS X 0213 - kuten coding
|
||||
deroo - De Roo number - numeric
|
||||
njecd - Halpern NJECD index number - numeric
|
||||
s_h - The Kanji Dictionary (Spahn & Hadamitzky) - descriptor
|
||||
nelson - "Classic" Nelson - numeric
|
||||
oneill - Japanese Names (O'Neill) - numeric
|
||||
--><!ELEMENT freq (#PCDATA)>
|
||||
<!--
|
||||
A frequency-of-use ranking. The 2,500 most-used characters have a
|
||||
ranking; those characters that lack this field are not ranked. The
|
||||
frequency is a number from 1 to 2,500 that expresses the relative
|
||||
frequency of occurrence of a character in modern Japanese. This is
|
||||
based on a survey in newspapers, so it is biassed towards kanji
|
||||
used in newspaper articles. The discrimination between the less
|
||||
frequently used kanji is not strong.
|
||||
--><!ELEMENT rad_name (#PCDATA)>
|
||||
<!--
|
||||
When the kanji is itself a radical and has a name, this element
|
||||
contains the name (in hiragana.) [T2]
|
||||
--><!ELEMENT dic_number (dic_ref)+>
|
||||
<!--
|
||||
This element contains the index numbers and similar unstructured
|
||||
information such as page numbers in a number of published dictionaries,
|
||||
and instructional books on kanji.
|
||||
--><!ELEMENT dic_ref (#PCDATA)>
|
||||
<!--
|
||||
Each dic_ref contains an index number. The particular dictionary,
|
||||
etc. is defined by the dr_type attribute.
|
||||
--><!ATTLIST dic_ref dr_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The dr_type defines the dictionary or reference book, etc. to which
|
||||
dic_ref element applies. The initial allocation is:
|
||||
nelson_c - "Modern Reader's Japanese-English Character Dictionary",
|
||||
edited by Andrew Nelson (now published as the "Classic"
|
||||
Nelson).
|
||||
nelson_n - "The New Nelson Japanese-English Character Dictionary",
|
||||
edited by John Haig.
|
||||
halpern_njecd - "New Japanese-English Character Dictionary",
|
||||
edited by Jack Halpern.
|
||||
halpern_kkld - "Kanji Learners Dictionary" (Kodansha) edited by
|
||||
Jack Halpern.
|
||||
heisig - "Remembering The Kanji" by James Heisig.
|
||||
gakken - "A New Dictionary of Kanji Usage" (Gakken)
|
||||
oneill_names - "Japanese Names", by P.G. O'Neill.
|
||||
oneill_kk - "Essential Kanji" by P.G. O'Neill.
|
||||
moro - "Daikanwajiten" compiled by Morohashi. For some kanji two
|
||||
additional attributes are used: m_vol: the volume of the
|
||||
dictionary in which the kanji is found, and m_page: the page
|
||||
number in the volume.
|
||||
henshall - "A Guide To Remembering Japanese Characters" by
|
||||
Kenneth G. Henshall.
|
||||
sh_kk - "Kanji and Kana" by Spahn and Hadamitzky.
|
||||
sakade - "A Guide To Reading and Writing Japanese" edited by
|
||||
Florence Sakade.
|
||||
henshall3 - "A Guide To Reading and Writing Japanese" 3rd
|
||||
edition, edited by Henshall, Seeley and De Groot.
|
||||
tutt_cards - Tuttle Kanji Cards, compiled by Alexander Kask.
|
||||
crowley - "The Kanji Way to Japanese Language Power" by
|
||||
Dale Crowley.
|
||||
kanji_in_context - "Kanji in Context" by Nishiguchi and Kono.
|
||||
busy_people - "Japanese For Busy People" vols I-III, published
|
||||
by the AJLT. The codes are the volume.chapter.
|
||||
kodansha_compact - the "Kodansha Compact Kanji Guide".
|
||||
--><!ATTLIST dic_ref m_vol CDATA #IMPLIED>
|
||||
<!--
|
||||
See above under "moro".
|
||||
--><!ATTLIST dic_ref m_page CDATA #IMPLIED>
|
||||
<!--
|
||||
See above under "moro".
|
||||
--><!ELEMENT query_code (q_code)+>
|
||||
<!--
|
||||
These codes contain information relating to the glyph, and can be used
|
||||
for finding a required kanji. The type of code is defined by the
|
||||
qc_type attribute.
|
||||
--><!ELEMENT q_code (#PCDATA)>
|
||||
<!--
|
||||
The q_code contains the actual query-code value, according to the
|
||||
qc_type attribute.
|
||||
--><!ATTLIST q_code qc_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The q_code attribute defines the type of query code. The current values
|
||||
are:
|
||||
skip - Halpern's SKIP (System of Kanji Indexing by Patterns)
|
||||
code. The format is n-nn-nn. See the KANJIDIC documentation
|
||||
for a description of the code and restrictions on the
|
||||
commercial use of this data. [P]
|
||||
|
||||
sh_desc - the descriptor codes for The Kanji Dictionary (Tuttle
|
||||
1996) by Spahn and Hadamitzky. They are in the form nxnn.n,
|
||||
e.g. 3k11.2, where the kanji has 3 strokes in the
|
||||
identifying radical, it is radical "k" in the SH
|
||||
classification system, there are 11 other strokes, and it is
|
||||
the 2nd kanji in the 3k11 sequence. (I am very grateful to
|
||||
Mark Spahn for providing the list of these descriptor codes
|
||||
for the kanji in this file.) [I]
|
||||
four_corner - the "Four Corner" code for the kanji. This is a code
|
||||
invented by Wang Chen in 1928. See the KANJIDIC documentation
|
||||
for an overview of the Four Corner System. [Q]
|
||||
|
||||
deroo - the codes developed by the late Father Joseph De Roo, and
|
||||
published in his book "2001 Kanji" (Bojinsha). Fr De Roo
|
||||
gave his permission for these codes to be included. [DR]
|
||||
misclass - a possible misclassification of the kanji according
|
||||
to one of the code types. (See the "Z" codes in the KANJIDIC
|
||||
documentation for more details.)
|
||||
|
||||
--><!ELEMENT reading_meaning (rmgroup* , nanori*)>
|
||||
<!--
|
||||
The readings for the kanji in several languages, and the meanings, also
|
||||
in several languages. The readings and meanings are grouped to enable
|
||||
the handling of the situation where the meaning is differentiated by
|
||||
reading. [T1]
|
||||
--><!ELEMENT nanori (#PCDATA)>
|
||||
<!--
|
||||
Japanese readings that are now only associated with names.
|
||||
--><!ELEMENT rmgroup (reading* , meaning*)>
|
||||
<!ELEMENT reading (#PCDATA)>
|
||||
<!--
|
||||
The reading element contains the reading or pronunciation
|
||||
of the kanji.
|
||||
--><!ATTLIST reading r_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The r_type attribute defines the type of reading in the reading
|
||||
element. The current values are:
|
||||
pinyin - the modern PinYin romanization of the Chinese reading
|
||||
of the kanji. The tones are represented by a concluding
|
||||
digit. [Y]
|
||||
korean_r - the romanized form of the Korean reading(s) of the
|
||||
kanji. The readings are in the (Republic of Korea) Ministry
|
||||
of Education style of romanization. [W]
|
||||
korean_h - the Korean reading(s) of the kanji in hangul.
|
||||
ja_on - the "on" Japanese reading of the kanji, in katakana. A
|
||||
second attribute r_status, if present, will indicate with
|
||||
a value of "jy" whether the reading is approved for a
|
||||
"Jouyou kanji".
|
||||
ja_kun - the "kun" Japanese reading of the kanji, in hiragana.
|
||||
Where relevant the okurigana is also included separated by a
|
||||
".". Readings associated with prefixes and suffixes are
|
||||
marked with a "-". A second attribute r_status, if present,
|
||||
will indicate with a value of "jy" whether the reading is
|
||||
approved for a "Jouyou kanji".
|
||||
--><!ATTLIST reading r_status CDATA #IMPLIED>
|
||||
<!--
|
||||
See under ja_on and ja_kun above.
|
||||
--><!ELEMENT meaning (#PCDATA)>
|
||||
<!--
|
||||
The meaning associated with the kanji.
|
||||
--><!ATTLIST meaning m_lang CDATA #IMPLIED>
|
||||
<!--
|
||||
The m_lang attribute defines the target language of the meaning. It
|
||||
will be coded using the two-letter language code from the ISO 639
|
||||
standard. When absent, the value "en" (i.e. English) is implied. [{}]
|
||||
-->]>
|
||||
<kanjidic2>
|
||||
</kanjidic2>
|
282
test/intsubset2.xml
Normal file
282
test/intsubset2.xml
Normal file
@ -0,0 +1,282 @@
|
||||
<?xml version="1.0"?>
|
||||
<!DOCTYPE kanjidic2 [
|
||||
<!-- Version 1.3
|
||||
This is the DTD of the XML-format kanji file combining information from
|
||||
the KANJIDIC and KANJD212 files. It is intended to be largely self-
|
||||
documenting, with each field being accompanied by an explanatory
|
||||
comment.
|
||||
|
||||
The file covers the following kanji:
|
||||
(a) the 6,355 kanji from JIS X 0208;
|
||||
(b) the 5,801 kanji from JIS X 0212;
|
||||
(c) the 3,625 kanji from JIS X 0213 as follows:
|
||||
(i) the 2,741 kanji which are also in JIS X 0212 have
|
||||
JIS X 0213 code-points (kuten) added to the existing entry;
|
||||
(ii) the 884 "new" kanji have new entries.
|
||||
|
||||
At the end of the explanation for a number of fields there is a tag
|
||||
with the format [N]. This indicates the leading letter(s) of the
|
||||
equivalent field in the KANJIDIC and KANJD212 files.
|
||||
|
||||
The KANJIDIC documentation should also be read for additional
|
||||
information about the information in the file.
|
||||
-->
|
||||
<!ELEMENT kanjidic2 (header,character*)>
|
||||
<!ELEMENT header (file_version,database_version,date_of_creation)>
|
||||
<!--
|
||||
The single header element will contain identification information
|
||||
about the version of the file
|
||||
-->
|
||||
<!ELEMENT file_version (#PCDATA)>
|
||||
<!--
|
||||
This field denotes the version of kanjidic2 structure, as more
|
||||
than one version may exist.
|
||||
-->
|
||||
<!ELEMENT database_version (#PCDATA)>
|
||||
<!--
|
||||
The version of the file, in the format YYYY-NN, where NN will be
|
||||
a number starting with 01 for the first version released in a
|
||||
calendar year, then increasing for each version in that year.
|
||||
-->
|
||||
<!ELEMENT date_of_creation (#PCDATA)>
|
||||
<!--
|
||||
The date the file was created in international format (YYYY-MM-DD).
|
||||
-->
|
||||
<!ELEMENT character (literal,codepoint, radical, misc, dic_number?, query_code?, reading_meaning?,nanori?)*>
|
||||
<!ELEMENT literal (#PCDATA)>
|
||||
<!--
|
||||
The character itself in UTF8 coding.
|
||||
-->
|
||||
<!ELEMENT codepoint (cp_value+)>
|
||||
<!--
|
||||
The codepoint element states the code of the character in the various
|
||||
character set standards.
|
||||
-->
|
||||
<!ELEMENT cp_value (#PCDATA)>
|
||||
<!--
|
||||
The cp_value contains the codepoint of the character in a particular
|
||||
standard. The standard will be identified in the cp_type attribute.
|
||||
-->
|
||||
<!ATTLIST cp_value cp_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The cp_type attribute states the coding standard applying to the
|
||||
element. The values assigned so far are:
|
||||
jis208 - JIS X 0208-1997 - kuten coding (nn-nn)
|
||||
jis212 - JIS X 0212-1990 - kuten coding (nn-nn)
|
||||
jis213 - JIS X 0213-2000 - kuten coding (p-nn-nn)
|
||||
ucs - Unicode 4.0 - hex coding (4 or 5 hexadecimal digits)
|
||||
-->
|
||||
<!ELEMENT radical (rad_value+)>
|
||||
<!ELEMENT rad_value (#PCDATA)>
|
||||
<!--
|
||||
The radical number, in the range 1 to 214. The particular
|
||||
classification type is stated in the rad_type attribute.
|
||||
-->
|
||||
<!ATTLIST rad_value rad_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The rad_type attribute states the type of radical classification.
|
||||
classical - as recorded in the KangXi Zidian.
|
||||
nelson - as used in the Nelson "Modern Japanese-English
|
||||
Character Dictionary" (i.e. the Classic, not the New Nelson).
|
||||
This will only be used where Nelson reclassified the kanji.
|
||||
-->
|
||||
<!ELEMENT misc (grade?, stroke_count+, variant*, freq*, rad_name*)>
|
||||
<!ELEMENT grade (#PCDATA)>
|
||||
<!--
|
||||
The Jouyou Kanji grade level. 1 through 6 indicate the grade in which
|
||||
the kanji is taught in Japanese schools. 8 indicates it is one of the
|
||||
remaining Jouyou Kanji to be learned in junior high school, and 9
|
||||
indicates it is a Jinmeiyou (for use in names) kanji. [G]
|
||||
-->
|
||||
<!ELEMENT stroke_count (#PCDATA)>
|
||||
<!--
|
||||
The stroke count of the kanji, including the radical. If more than
|
||||
one, the first is considered the accepted count, while subsequent ones
|
||||
are common miscounts. (See Appendix E. of the KANJIDIC documentation
|
||||
for some of the rules applied when counting strokes in some of the
|
||||
radicals.) [S]
|
||||
-->
|
||||
<!ELEMENT variant (#PCDATA)>
|
||||
<!--
|
||||
A cross-reference code to another kanji, usually regarded as a variant.
|
||||
The type of cross-reference is given in the var_type attribute.
|
||||
-->
|
||||
<!ATTLIST variant var_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The var_type attribute indicates the type of variant code. The current
|
||||
values are:
|
||||
jis208 - in JIS X 0208 - kuten coding
|
||||
jis212 - in JIS X 0212 - kuten coding
|
||||
jis213 - in JIS X 0213 - kuten coding
|
||||
deroo - De Roo number - numeric
|
||||
njecd - Halpern NJECD index number - numeric
|
||||
s_h - The Kanji Dictionary (Spahn & Hadamitzky) - descriptor
|
||||
nelson - "Classic" Nelson - numeric
|
||||
oneill - Japanese Names (O'Neill) - numeric
|
||||
-->
|
||||
<!ELEMENT freq (#PCDATA)>
|
||||
<!--
|
||||
A frequency-of-use ranking. The 2,500 most-used characters have a
|
||||
ranking; those characters that lack this field are not ranked. The
|
||||
frequency is a number from 1 to 2,500 that expresses the relative
|
||||
frequency of occurrence of a character in modern Japanese. This is
|
||||
based on a survey in newspapers, so it is biassed towards kanji
|
||||
used in newspaper articles. The discrimination between the less
|
||||
frequently used kanji is not strong.
|
||||
-->
|
||||
<!ELEMENT rad_name (#PCDATA)>
|
||||
<!--
|
||||
When the kanji is itself a radical and has a name, this element
|
||||
contains the name (in hiragana.) [T2]
|
||||
-->
|
||||
<!ELEMENT dic_number (dic_ref+)>
|
||||
<!--
|
||||
This element contains the index numbers and similar unstructured
|
||||
information such as page numbers in a number of published dictionaries,
|
||||
and instructional books on kanji.
|
||||
-->
|
||||
<!ELEMENT dic_ref (#PCDATA)>
|
||||
<!--
|
||||
Each dic_ref contains an index number. The particular dictionary,
|
||||
etc. is defined by the dr_type attribute.
|
||||
-->
|
||||
<!ATTLIST dic_ref dr_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The dr_type defines the dictionary or reference book, etc. to which
|
||||
dic_ref element applies. The initial allocation is:
|
||||
nelson_c - "Modern Reader's Japanese-English Character Dictionary",
|
||||
edited by Andrew Nelson (now published as the "Classic"
|
||||
Nelson).
|
||||
nelson_n - "The New Nelson Japanese-English Character Dictionary",
|
||||
edited by John Haig.
|
||||
halpern_njecd - "New Japanese-English Character Dictionary",
|
||||
edited by Jack Halpern.
|
||||
halpern_kkld - "Kanji Learners Dictionary" (Kodansha) edited by
|
||||
Jack Halpern.
|
||||
heisig - "Remembering The Kanji" by James Heisig.
|
||||
gakken - "A New Dictionary of Kanji Usage" (Gakken)
|
||||
oneill_names - "Japanese Names", by P.G. O'Neill.
|
||||
oneill_kk - "Essential Kanji" by P.G. O'Neill.
|
||||
moro - "Daikanwajiten" compiled by Morohashi. For some kanji two
|
||||
additional attributes are used: m_vol: the volume of the
|
||||
dictionary in which the kanji is found, and m_page: the page
|
||||
number in the volume.
|
||||
henshall - "A Guide To Remembering Japanese Characters" by
|
||||
Kenneth G. Henshall.
|
||||
sh_kk - "Kanji and Kana" by Spahn and Hadamitzky.
|
||||
sakade - "A Guide To Reading and Writing Japanese" edited by
|
||||
Florence Sakade.
|
||||
henshall3 - "A Guide To Reading and Writing Japanese" 3rd
|
||||
edition, edited by Henshall, Seeley and De Groot.
|
||||
tutt_cards - Tuttle Kanji Cards, compiled by Alexander Kask.
|
||||
crowley - "The Kanji Way to Japanese Language Power" by
|
||||
Dale Crowley.
|
||||
kanji_in_context - "Kanji in Context" by Nishiguchi and Kono.
|
||||
busy_people - "Japanese For Busy People" vols I-III, published
|
||||
by the AJLT. The codes are the volume.chapter.
|
||||
kodansha_compact - the "Kodansha Compact Kanji Guide".
|
||||
-->
|
||||
<!ATTLIST dic_ref m_vol CDATA #IMPLIED>
|
||||
<!--
|
||||
See above under "moro".
|
||||
-->
|
||||
<!ATTLIST dic_ref m_page CDATA #IMPLIED>
|
||||
<!--
|
||||
See above under "moro".
|
||||
-->
|
||||
<!ELEMENT query_code (q_code+)>
|
||||
<!--
|
||||
These codes contain information relating to the glyph, and can be used
|
||||
for finding a required kanji. The type of code is defined by the
|
||||
qc_type attribute.
|
||||
-->
|
||||
<!ELEMENT q_code (#PCDATA)>
|
||||
<!--
|
||||
The q_code contains the actual query-code value, according to the
|
||||
qc_type attribute.
|
||||
-->
|
||||
<!ATTLIST q_code qc_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The q_code attribute defines the type of query code. The current values
|
||||
are:
|
||||
skip - Halpern's SKIP (System of Kanji Indexing by Patterns)
|
||||
code. The format is n-nn-nn. See the KANJIDIC documentation
|
||||
for a description of the code and restrictions on the
|
||||
commercial use of this data. [P]
|
||||
|
||||
sh_desc - the descriptor codes for The Kanji Dictionary (Tuttle
|
||||
1996) by Spahn and Hadamitzky. They are in the form nxnn.n,
|
||||
e.g. 3k11.2, where the kanji has 3 strokes in the
|
||||
identifying radical, it is radical "k" in the SH
|
||||
classification system, there are 11 other strokes, and it is
|
||||
the 2nd kanji in the 3k11 sequence. (I am very grateful to
|
||||
Mark Spahn for providing the list of these descriptor codes
|
||||
for the kanji in this file.) [I]
|
||||
four_corner - the "Four Corner" code for the kanji. This is a code
|
||||
invented by Wang Chen in 1928. See the KANJIDIC documentation
|
||||
for an overview of the Four Corner System. [Q]
|
||||
|
||||
deroo - the codes developed by the late Father Joseph De Roo, and
|
||||
published in his book "2001 Kanji" (Bojinsha). Fr De Roo
|
||||
gave his permission for these codes to be included. [DR]
|
||||
misclass - a possible misclassification of the kanji according
|
||||
to one of the code types. (See the "Z" codes in the KANJIDIC
|
||||
documentation for more details.)
|
||||
|
||||
-->
|
||||
<!ELEMENT reading_meaning (rmgroup*, nanori*)>
|
||||
<!--
|
||||
The readings for the kanji in several languages, and the meanings, also
|
||||
in several languages. The readings and meanings are grouped to enable
|
||||
the handling of the situation where the meaning is differentiated by
|
||||
reading. [T1]
|
||||
-->
|
||||
<!ELEMENT nanori (#PCDATA)>
|
||||
<!--
|
||||
Japanese readings that are now only associated with names.
|
||||
-->
|
||||
<!ELEMENT rmgroup (reading*, meaning*)>
|
||||
<!ELEMENT reading (#PCDATA)>
|
||||
<!--
|
||||
The reading element contains the reading or pronunciation
|
||||
of the kanji.
|
||||
-->
|
||||
<!ATTLIST reading r_type CDATA #REQUIRED>
|
||||
<!--
|
||||
The r_type attribute defines the type of reading in the reading
|
||||
element. The current values are:
|
||||
pinyin - the modern PinYin romanization of the Chinese reading
|
||||
of the kanji. The tones are represented by a concluding
|
||||
digit. [Y]
|
||||
korean_r - the romanized form of the Korean reading(s) of the
|
||||
kanji. The readings are in the (Republic of Korea) Ministry
|
||||
of Education style of romanization. [W]
|
||||
korean_h - the Korean reading(s) of the kanji in hangul.
|
||||
ja_on - the "on" Japanese reading of the kanji, in katakana. A
|
||||
second attribute r_status, if present, will indicate with
|
||||
a value of "jy" whether the reading is approved for a
|
||||
"Jouyou kanji".
|
||||
ja_kun - the "kun" Japanese reading of the kanji, in hiragana.
|
||||
Where relevant the okurigana is also included separated by a
|
||||
".". Readings associated with prefixes and suffixes are
|
||||
marked with a "-". A second attribute r_status, if present,
|
||||
will indicate with a value of "jy" whether the reading is
|
||||
approved for a "Jouyou kanji".
|
||||
-->
|
||||
<!ATTLIST reading r_status CDATA #IMPLIED>
|
||||
<!--
|
||||
See under ja_on and ja_kun above.
|
||||
-->
|
||||
<!ELEMENT meaning (#PCDATA)>
|
||||
<!--
|
||||
The meaning associated with the kanji.
|
||||
-->
|
||||
<!ATTLIST meaning m_lang CDATA #IMPLIED>
|
||||
<!--
|
||||
The m_lang attribute defines the target language of the meaning. It
|
||||
will be coded using the two-letter language code from the ISO 639
|
||||
standard. When absent, the value "en" (i.e. English) is implied. [{}]
|
||||
-->
|
||||
] >
|
||||
<kanjidic2>
|
||||
</kanjidic2>
|
Loading…
Reference in New Issue
Block a user