From 74ca5511d7fd2ec72112c357e1964feff947343b Mon Sep 17 00:00:00 2001 From: Slava Monich Date: Wed, 27 May 2015 00:00:50 +0300 Subject: [PATCH] Initial import of linebreak --- linebreak/linebreak/AUTHORS | 8 + linebreak/linebreak/CVS/Entries | 32 + linebreak/linebreak/CVS/Repository | 1 + linebreak/linebreak/CVS/Root | 1 + linebreak/linebreak/ChangeLog | 512 +++++++ linebreak/linebreak/Doxyfile | 1219 +++++++++++++++ linebreak/linebreak/LICENCE | 19 + linebreak/linebreak/LineBreak1.sed | 1 + linebreak/linebreak/LineBreak2.sed | 2 + linebreak/linebreak/Makefile.am | 63 + linebreak/linebreak/Makefile.gcc | 177 +++ linebreak/linebreak/Makefile.msvc | 189 +++ linebreak/linebreak/NEWS | 49 + linebreak/linebreak/README | 88 ++ linebreak/linebreak/bootstrap | 6 + linebreak/linebreak/configure.ac | 12 + linebreak/linebreak/filter_dup.c | 47 + linebreak/linebreak/libunibreak.pc.in | 11 + linebreak/linebreak/linebreak.c | 737 +++++++++ linebreak/linebreak/linebreak.h | 87 ++ linebreak/linebreak/linebreakdata.c | 1868 +++++++++++++++++++++++ linebreak/linebreak/linebreakdata1.tmpl | 1 + linebreak/linebreak/linebreakdata2.tmpl | 7 + linebreak/linebreak/linebreakdata3.tmpl | 2 + linebreak/linebreak/linebreakdef.c | 139 ++ linebreak/linebreak/linebreakdef.h | 149 ++ linebreak/linebreak/purge | 2 + linebreak/linebreak/sort_numeric_hex.py | 6 + linebreak/linebreak/wordbreak.c | 437 ++++++ linebreak/linebreak/wordbreak.h | 72 + linebreak/linebreak/wordbreakdata.c | 860 +++++++++++ linebreak/linebreak/wordbreakdata1.tmpl | 5 + linebreak/linebreak/wordbreakdata2.tmpl | 2 + linebreak/linebreak/wordbreakdef.h | 78 + 34 files changed, 6889 insertions(+) create mode 100644 linebreak/linebreak/AUTHORS create mode 100644 linebreak/linebreak/CVS/Entries create mode 100644 linebreak/linebreak/CVS/Repository create mode 100644 linebreak/linebreak/CVS/Root create mode 100644 linebreak/linebreak/ChangeLog create mode 100644 linebreak/linebreak/Doxyfile create mode 100644 
linebreak/linebreak/LICENCE create mode 100644 linebreak/linebreak/LineBreak1.sed create mode 100644 linebreak/linebreak/LineBreak2.sed create mode 100644 linebreak/linebreak/Makefile.am create mode 100644 linebreak/linebreak/Makefile.gcc create mode 100644 linebreak/linebreak/Makefile.msvc create mode 100644 linebreak/linebreak/NEWS create mode 100644 linebreak/linebreak/README create mode 100755 linebreak/linebreak/bootstrap create mode 100644 linebreak/linebreak/configure.ac create mode 100644 linebreak/linebreak/filter_dup.c create mode 100644 linebreak/linebreak/libunibreak.pc.in create mode 100644 linebreak/linebreak/linebreak.c create mode 100644 linebreak/linebreak/linebreak.h create mode 100644 linebreak/linebreak/linebreakdata.c create mode 100644 linebreak/linebreak/linebreakdata1.tmpl create mode 100644 linebreak/linebreak/linebreakdata2.tmpl create mode 100644 linebreak/linebreak/linebreakdata3.tmpl create mode 100644 linebreak/linebreak/linebreakdef.c create mode 100644 linebreak/linebreak/linebreakdef.h create mode 100755 linebreak/linebreak/purge create mode 100755 linebreak/linebreak/sort_numeric_hex.py create mode 100644 linebreak/linebreak/wordbreak.c create mode 100644 linebreak/linebreak/wordbreak.h create mode 100644 linebreak/linebreak/wordbreakdata.c create mode 100644 linebreak/linebreak/wordbreakdata1.tmpl create mode 100644 linebreak/linebreak/wordbreakdata2.tmpl create mode 100644 linebreak/linebreak/wordbreakdef.h diff --git a/linebreak/linebreak/AUTHORS b/linebreak/linebreak/AUTHORS new file mode 100644 index 0000000..22786d4 --- /dev/null +++ b/linebreak/linebreak/AUTHORS @@ -0,0 +1,8 @@ +Wu Yongwei. Designed and implemented liblinebreak. + +Nikolay Pultsin. Put forward the original requirements on liblinebreak, +performed tests, and made a lot of suggestions on the initial versions. + +Thomas Klausner. Autoconfiscated and libtoolized liblinebreak. + +Tom Hacohen. Added word boundaries support. 
diff --git a/linebreak/linebreak/CVS/Entries b/linebreak/linebreak/CVS/Entries new file mode 100644 index 0000000..c8adfa0 --- /dev/null +++ b/linebreak/linebreak/CVS/Entries @@ -0,0 +1,32 @@ +/AUTHORS/1.2/Wed Jan 18 14:26:13 2012// +/ChangeLog/1.78/Sat Aug 11 07:35:23 2012// +/Doxyfile/1.7/Sat Aug 11 06:55:18 2012// +/LICENCE/1.4/Sat Aug 11 07:35:23 2012// +/LineBreak1.sed/1.2/Sun Dec 7 10:54:37 2008// +/LineBreak2.sed/1.2/Sun Dec 7 10:54:37 2008// +/Makefile.am/1.8/Sat Aug 11 06:55:18 2012// +/Makefile.gcc/1.4/Thu Jan 19 14:03:34 2012// +/Makefile.msvc/1.5/Sat Aug 11 05:57:50 2012// +/NEWS/1.7/Sat Aug 11 06:55:18 2012// +/README/1.8/Sat Aug 11 06:55:18 2012// +/bootstrap/1.1/Fri Dec 12 12:01:39 2008// +/configure.ac/1.6/Sat Aug 11 06:55:18 2012// +/filter_dup.c/1.1/Sat Feb 23 11:53:28 2008// +/libunibreak.pc.in/1.1/Sat Aug 11 06:55:18 2012// +/linebreak.c/1.25/Sat May 7 19:55:10 2011// +/linebreak.h/1.14/Sat May 7 19:55:10 2011// +/linebreakdata.c/1.5/Sat May 7 19:40:20 2011// +/linebreakdata1.tmpl/1.1/Sat Feb 23 11:53:28 2008// +/linebreakdata2.tmpl/1.2/Sun Mar 2 07:30:43 2008// +/linebreakdata3.tmpl/1.1/Sat Feb 23 11:53:28 2008// +/linebreakdef.c/1.12/Sat May 7 19:55:10 2011// +/linebreakdef.h/1.12/Sat May 7 19:55:10 2011// +/purge/1.1/Fri Dec 12 12:01:39 2008// +/sort_numeric_hex.py/1.2/Wed Jan 18 14:26:13 2012// +/wordbreak.c/1.3/Sat Feb 4 14:32:57 2012// +/wordbreak.h/1.4/Sat Feb 4 14:32:58 2012// +/wordbreakdata.c/1.2/Wed Jan 18 14:26:13 2012// +/wordbreakdata1.tmpl/1.2/Wed Jan 18 14:26:13 2012// +/wordbreakdata2.tmpl/1.2/Wed Jan 18 14:26:13 2012// +/wordbreakdef.h/1.2/Wed Jan 18 14:26:13 2012// +D diff --git a/linebreak/linebreak/CVS/Repository b/linebreak/linebreak/CVS/Repository new file mode 100644 index 0000000..668df54 --- /dev/null +++ b/linebreak/linebreak/CVS/Repository @@ -0,0 +1 @@ +common/tools/linebreak diff --git a/linebreak/linebreak/CVS/Root b/linebreak/linebreak/CVS/Root new file mode 100644 index 0000000..3d25408 --- /dev/null +++ 
b/linebreak/linebreak/CVS/Root @@ -0,0 +1 @@ +:pserver:anonymous@vimgadgets.cvs.sourceforge.net:/cvsroot/vimgadgets diff --git a/linebreak/linebreak/ChangeLog b/linebreak/linebreak/ChangeLog new file mode 100644 index 0000000..20de2e2 --- /dev/null +++ b/linebreak/linebreak/ChangeLog @@ -0,0 +1,512 @@ +2012-08-11 Wu Yongwei + + * LICENCE: Add copyright information about Tom Hacohen. + +2012-08-11 Wu Yongwei + + * configure.ac (AC_INIT): Change the library name and version to + `libunibreak' and `1.0'. + (AC_PROG_LN_S): New macro. + (AC_OUTPUT): Change to `libunibreak.pc'. + * Doxyfile (PROJECT_NAME): Change to `libunibreak'. + (PROJECT_NUMBER): Change to `1.0'. + * Makefile.am (lib_LTLIBRARIES): Change to `libunibreak.la'. + (pkgconfig_DATA): Change to `libunibreak.pc'. + (libunibreak_la_LDFLAGS): Reset the version to `1:0'. + (install-exec-hook): Replace the static library liblinebreak.a with + a symlink to libunibreak.a. + * NEWS: Add information about libunibreak 1.0. + * README: Change the library name, and add information about word + break. + +2012-08-11 Wu Yongwei + + * Makefile.msvc: Change the library name to `libunibreak', and the + output library to `unibreak.lib'. + +2012-02-04 Wu Yongwei + + * wordbreak.h (WORDBREAK_INSIDEACHAR): Change from + WORDBREAK_INSIDECHAR. + * wordbreak.c (set_brks_to): Change `WORDBREAK_INSIDECHAR' to + `WORDBREAK_INSIDEACHAR'. + +2012-01-19 Wu Yongwei + + * wordbreak.h: Change angle brackets to quotation marks (which + caused build errors). + +2012-01-19 Wu Yongwei + + * Makefile.gcc (CFILES): Add wordbreak.c. + (WordBreakProperty.txt): New target. + (wordbreakdata): New target. + +2012-01-19 Wu Yongwei + + * Makefile.am (liblinebreak_la_SOURCES): Remove wordbreakdata.c. + (EXTRA_DIST): Add wordbreakdata.c, wordbreakdata1.tmpl, and + wordbreakdata2.tmpl. + +2012-01-19 Wu Yongwei + + * Makefile.msvc: Add wordbreak files. + +2012-01-18 Tom Hacohen + + Add word breaking support. + * AUTHORS: Add `Tom Hacohen'. 
+ * Makefile.am (include_HEADERS): Add header files for word breaking. + (liblinebreak_la_SOURCES): Add source files for word breaking. + (sort_numeric_hex.py): Add `sort_numeric_hex.py'. + (distclean-local): Clean also `WordBreakData.txt'. + (WordBreakProperty.txt): New target. + (wordbreakdata): New target. + * sort_numeric_hex.py: New file. + * wordbreak.c: New file. + * wordbreak.h: New file. + * wordbreakdef.h: New file. + * wordbreakdata.c: New file. + * wordbreakdata1.tmpl: New file. + * wordbreakdata2.tmpl: New file. + +2011-05-17 Wu Yongwei + + Add support for pkg-config (thanks to Tom Hacohen). + * liblinebreak.pc.in: New file. + * configure.ac (AC_OUTPUT): Add `liblinebreak.pc'. + * Makefile.am (pkgconfig_DATA): Set to `liblinebreak.pc'. + (pkgconfigdir): Set to `$(libdir)/pkgconfig'. + +2011-05-07 Wu Yongwei + + * README: Update the reference to UAX #14-26, for Unicode 6.0.0. + +2011-05-07 Wu Yongwei + + * configure.ac (AC_INIT): Increase the version to 2.1. + * Makefile.am (liblinebreak_la_LDFLAGS): Set the version-info to + `2:1'. + +2011-05-07 Wu Yongwei + + * LICENCE: Update the copyright year. + +2011-05-07 Wu Yongwei + + Update for the 2.1 release. + * Doxyfile (PROJECT_NUMBER): Set to `2.1'. + * NEWS: Add information about the 2.1 release. + * linebreak.h (LINEBREAK_VERSION): Set to `0x0201'. + * linebreak.h: Update comments. + * linebreak.c: Ditto. + * linebreakdef.h: Ditto. + * linebreakdef.c: Ditto. + +2011-05-07 Wu Yongwei + + * linebreakdata.c: Regenerate from LineBreak-6.0.0.txt. + +2011-05-07 Wu Yongwei + + * linebreak.c (set_linebreaks): Fix the assertion failure when + U+FFFC (OBJECT REPLACEMENT CHARACTER) appears at the beginning of a + line (thanks to Tom Hacohen). + +2010-01-03 Wu Yongwei + + * LICENCE: Update the copyright year. + +2010-01-03 Wu Yongwei + + * NEWS: Add information about the 2.0 release. + +2010-01-03 Wu Yongwei + + * Doxyfile (PROJECT_NUMBER): Set to `2.0'. + (HAVE_DOT): Set to `YES'. 
+ +2010-01-03 Wu Yongwei + + * linebreak.c: Update the version number in comment to 2.0. + * linebreak.h: Ditto. + * linebreakdef.c: Ditto. + * linebreakdef.h: Ditto. + +2009-12-17 Wu Yongwei + + Change the values of enum BreakAction to the same length. + * linebreak.c (DIRECT_BRK): Rename to DIR_BRK. + (INDIRECT_BRK): Rename to IND_BRK. + (CM_INDIRECT_BRK): Rename to CMI_BRK. + (CM_PROHIBITED_BRK): Rename to CMP_BRK. + (PROHIBITED_BRK): Rename to PRH_BRK. + +2009-11-29 Wu Yongwei + + * Doxyfile (TAB_SIZE): Set to the correct size `4', as used in the + source files. + +2009-11-29 Wu Yongwei + + Update files according to UAX #14-24, for Unicode 5.2.0. + * linebreak.c: Update comments about UAX #14. + * linebreak.h: Ditto. + * linebreakdef.c: Ditto. + * linebreakdef.h: Ditto. + (LBP_CP): New enumerator for the new `CP' class as defined in + UAX #14-24. + * linebreak.c (baTable): Update for the new class `CP'. + * linebreakdata.c: Regenerate from LineBreak-5.2.0.txt. + * README: Update the reference to UAX #14-24, for Unicode 5.2.0. + +2009-05-03 Wu Yongwei + + * NEWS: Add information about the 1.2 release. + +2009-04-30 Wu Yongwei + + Optimize the Doxygen output. + * linebreak.c (lb_prop_index): Adjust its definition format + slightly. + +2009-04-30 Wu Yongwei + + * Doxyfile (USE_WINDOWS_ENCODING): Remove obsolete tag. + (DETAILS_AT_TOP): Ditto. + (MAX_DOT_GRAPH_WIDTH): Ditto. + (MAX_DOT_GRAPH_HEIGHT): Ditto. + (REFERENCED_BY_RELATION): Set to `NO'. + (REFERENCES_RELATION): Ditto. + (EXCLUDE): Add `filter_dup.c'. + +2009-04-28 Wu Yongwei + + * linebreak.c (lb_get_next_char_utf8): Fix the issue that the index + can point to the middle of a UTF-8 sequence if End of String (EOS) + is encountered prematurely (thanks to Nikolay Pultsin and Rick Xu). + (lb_get_next_char_utf16): Fix the issue that the index can point to + the middle of a UTF-16 surrogate pair if EOS is encountered + prematurely. 
+ +2009-04-20 Wu Yongwei + + * linebreakdef.c (lb_prop_English): Remove the specialization of + right single quotation mark as closing punctuation mark, because it + can be used as apostrophe. + (lb_prop_Spanish): Ditto. + (lb_prop_French): Ditto. + +2009-04-09 Wu Yongwei + + * Makefile.msvc: Make the `clean' target work on MSVC versions other + than 6.0; do not use precompiled header. + +2009-03-07 Wu Yongwei + + * linebreak.h: Correct the wrong date in the documentation comment. + * linebreakdef.h: Ditto. + +2009-02-10 Wu Yongwei + + * configure.ac (AC_INIT): Increase the version to 2.0. + * Makefile.am (liblinebreak_la_LDFLAGS): Set the version-info to + `2:0'. + +2009-02-10 Wu Yongwei + + * linebreak.h (LINEBREAK_VERSION): New macro. + (linebreak_version): New global constant declaration. + * linebreak.c (linebreak_version): New global constant definition. + +2009-02-10 Wu Yongwei + + Reduce namespace pollution. + * linebreak.c (get_lb_prop_lang): Mark as static. + (get_next_char_utf8): Rename to lb_get_next_char_utf8. + (get_next_char_utf16): Rename to lb_get_next_char_utf16. + (get_next_char_utf32): Rename to lb_get_next_char_utf32. + (is_breakable): Rename to is_line_breakable. + * linebreak.h (get_next_char_utf8): Remove the function prototype + declaration. + (get_next_char_utf16): Ditto. + (get_next_char_utf32): Ditto. + (is_breakable): Rename to is_line_breakable. + * linebreakdef.h (lb_get_next_char_utf8): Add the function prototype + declaration. + (lb_get_next_char_utf16): Ditto. + (lb_get_next_char_utf32): Ditto. + +2009-02-06 Wu Yongwei + + * NEWS: Add information about the 1.1 release. + +2009-01-02 Wu Yongwei + + * Makefile.am (EXTRA_DIST): Add the missing `LICENCE' file. + +2008-12-31 Wu Yongwei + + * linebreak.c: Update the version number in comment to 1.0. + * linebreak.h: Ditto. + * linebreakdef.c: Ditto. + * linebreakdef.h: Ditto. + +2008-12-31 Wu Yongwei + + * NEWS: Update for the 1.0 release. 
+ +2008-12-31 Wu Yongwei + + * README: Correct two typos. + +2008-12-31 Wu Yongwei + + * README: Add the online URL reference. + +2008-12-30 Wu Yongwei + + * README: Update the reference to UAX #14-22, for Unicode 5.1.0. + +2008-12-13 Wu Yongwei + + Update files according to UAX #14-22, for Unicode 5.1.0. + * linebreak.c (baTable): Update according to Table 2 of UAX #14-22. + * linebreakdef.c (lb_prop_Spanish): Remove the unnecessary + customization for inverted marks in Spanish. + * linebreakdata.c: Regenerate from LineBreak-5.1.0.txt. + * linebreak.h: Update comment only. + * linebreakdef.h: Ditto. + +2008-12-12 Wu Yongwei + + * README: Update for the new build methods and better readability. + +2008-12-12 Wu Yongwei + + * Makefile.msvc: Correct the inconsistent naming in the output + message. + +2008-12-12 Wu Yongwei + + * configure.ac (AM_INIT_AUTOMAKE): Mark `foreign'. + * bootstrap: New file. + * purge: New file. + * Makefile.gcc (purge): Remove this target. + +2008-12-10 Wu Yongwei + + * NEWS: New file. + +2008-12-10 Wu Yongwei + + * AUTHORS: New file. + +2008-12-10 Wu Yongwei + + * Makefile.gcc (purge): New phony target to purge files generated by + autoconfiscation. + +2008-12-10 Thomas Klausner + + * configure.ac: New file. + * Makefile.am: New file. + +2008-12-10 Wu Yongwei + + * Doxyfile (OUTPUT_DIRECTORY): Set to `doc'. + (ALPHABETICAL_INDEX): Set to `YES'. + +2008-12-09 Wu Yongwei + + * Makefile.msvc: New file. + +2008-12-09 Wu Yongwei + + * Makefile: Remove (to become Makefile.gcc). + * Makefile.gcc: New file (was Makefile). + +2008-12-07 Wu Yongwei + + * linebreak.c: Adjust the comment that refers to Unicode Annex 14. + * linebreak.h: Ditto. + * linebreakdef.c: Ditto. + * linebreakdef.h: Ditto. + +2008-12-07 Wu Yongwei + + Use only POSIX basic regexp to ensure maximum portability (issues + have been found on Mac OS X, where GNU extensions do not work). + * LineBreak1.sed: Replace `[:xdigit:]' with `0-9A-F', and `\+' with + `\{1,\}'. 
+ * LineBreak2.sed: Ditto. + +2008-12-07 Wu Yongwei + + * Makefile: Replace `*.exe' with `filter_dup$(EXEEXT)', since the + extension `.exe' is specific to Windows. + +2008-04-20 Wu Yongwei + + Add README and LICENCE files, as well as a Doxyfile to generate + documents. + * README: New file. + * LICENCE: New file. + * Doxyfile: New file. + * Makefile (doc): Add new phony target. + +2008-04-04 Wu Yongwei + + Remove the English override for plus sign: it is better treated in + the text breaking program (see ../breaktext/ for an example). + * linebreakdef.c (lb_prop_English): Remove the line for plus sign. + +2008-03-29 Wu Yongwei + + * Makefile: Correct the dependency-making rules when OLDGCC=Y. + +2008-03-23 Wu Yongwei + + * Makefile (clean): Do not remove *.exe and tags here. + (distclean): Remove *.exe and tags. + +2008-03-23 Wu Yongwei + + Remove the English override for solidus: it is better treated in the + text breaking program (see ../breaktext/ for an example). + * linebreakdef.c (lb_prop_English): Remove the line for solidus. + +2008-03-16 Wu Yongwei + + Rename init_linebreak_prop_index to init_linebreak for future + safety; make visible certain functions that are potentially useful. + * linebreak.c (init_linebreak_prop_index): Rename to init_linebreak. + (get_next_char_t): Move to linebreakdef.h. + (get_next_char_utf8): Make non-static. + (get_next_char_utf16): Ditto. + (get_next_char_utf32): Ditto. + (set_linebreaks): Ditto. + * linebreak.h (init_linebreak_prop_index): Rename to init_linebreak. + (get_next_char_utf8): Add the function prototype. + (get_next_char_utf16): Ditto. + (get_next_char_utf32): Ditto. + * linebreakdef.h (get_next_char_t): Add the typedef. + (set_linebreaks): Add the function prototype. + +2008-03-16 Wu Yongwei + + * Makefile (OLDGCC): Add support for GCC 2.95.3 (when OLDGCC=Y). + +2008-03-15 Wu Yongwei + + * linebreak.c (set_linebreaks): Fix a bug that `==' was wrongly used + for `='. 
+ +2008-03-05 Wu Yongwei + + Improve the performance by reducing the look-ups of the + language-specific line breaking properties array from the language + name (thanks to Nikolay Pultsin). + * linebreak.c (get_lb_prop_lang): New function. + (get_char_lb_class_lang): Change the second parameter from the + language name to the line breaking properties array. + (set_linebreaks): Look up the language-specific line breaking + properties array from the language name only once in one function + call. + +2008-03-03 Wu Yongwei + + Make minor adjustments in code and comments. + * linebreak.c: Adjust the doc comments. + (init_linebreak_prop_index): Modify a conditional to make it more + robust and consistent. + * linebreakdef.c (lb_prop_lang_map): Replace the pointer + lb_prop_default with NULL, since the value is never used. + +2008-03-03 Wu Yongwei + + Accelerate get_char_lb_class for invalid Unicode code points. + * linebreak.c (get_char_lb_class): Adjust the conditionals so that + getting the line breaking class for an invalid code point is much + faster, which requires the array of line breaking properties be + sorted. + * linebreakdef.h: Adjust a comment that the array of line break + properties must be sorted. + +2008-03-02 Wu Yongwei + + Change the values of enum BreakAction to more complete forms. + * linebreak.c (INDRCT_BRK): Rename to INDIRECT_BRK. + (CM_INDRCT_BRK): Rename to CM_INDIRECT_BRK. + (CM_PROHIBTD_BRK): Rename to CM_PROHIBITED_BRK. + (PROHIBTD_BRK): Rename to PROHIBITED_BRK. + +2008-03-02 Wu Yongwei + + Implement a two-stage search in get_char_lb_class_default to + accelerate the overall performance, especially for non-Latin + languages. + * linebreak.c (LINEBREAK_INDEX_SIZE): New constant macro. + (struct LineBreakPropertiesIndex): New struct. + (lb_prop_index): New static variable. + (init_linebreak_prop_index): New function. + (get_char_lb_class_default): New function. + (get_char_lb_class_lang): Use get_char_lb_class_default. 
+ * linebreak.h: Detect C++ and add extern "C" guard if necessary. + (init_linebreak_prop_index): Add the prototype declaration. + * linebreakdef.h: Adjust a comment. + +2008-03-02 Wu Yongwei + + Split/refactor the code; add (doc) comments. + * Makefile (CFILES): Add linebreakdata.c and linebreakdef.c. + * linebreak.c: Add and adjust comments. + (linebreakdef.h): Add include file. + (linebreakdata.c): Remove include file. + (EOS): Remove (now in linebreakdef.h). + (enum LineBreakClass): Ditto. + (struct LineBreakProperties): Ditto. + (lbpEnglish): Remove (now in linebreakdef.c as lb_prop_English). + (lbpGerman): Remove (now in linebreakdef.c as lb_prop_German). + (lbpSpanish): Remove (now in linebreakdef.c as lb_prop_Spanish). + (lbpFrench): Remove (now in linebreakdef.c as lb_prop_French). + (lbpRussian): Remove (now in linebreakdef.c as lb_prop_Russian). + (lbpChinese): Remove (now in linebreakdef.c as lb_prop_Chinese). + (struct LineBreakPropertiesLang): Remove (now in linebreakdef.h). + (lbpLangs): Remove (now in linebreakdef.c as lb_prop_lang_map). + (get_next_char_utf16): Make sure memory access not go beyond len. + * linebreak.h: Add copyright information and adjust comments. + (stddef.h): Add include file. + * linebreakdata.c (linebreak.h): Add include file. + (linebreakdef.h): Add include file. + (lbpDefault): Make global and rename to lb_prop_default. + * linebreakdata2.tmpl: Add two include files, a comment line, and + remove `static'. + * linebreakdef.c: New file. + * linebreakdef.h: New file. + +2008-02-26 Wu Yongwei + + * linebreak.c (lbpSpanish): New array for Spanish-specific data. + (lbpLangs): Update the index array for Spanish. + (resolve_lb_class): Resolve AmbIguous class to IDeographic in + Chinese, Japanese, and Korean. + +2008-02-26 Wu Yongwei + + * Makefile (LineBreak.txt): Add new rule to retrieve it from the Web + if it is not already there. + +2008-02-23 Wu Yongwei + + Add files for linebreak. + * LineBreak1.sed: New file. 
+ * LineBreak2.sed: New file. + * Makefile: New file. + * filter_dup.c: New file. + * linebreak.c: New file. + * linebreak.h: New file. + * linebreakdata.c: New file. + * linebreakdata1.tmpl: New file. + * linebreakdata2.tmpl: New file. + * linebreakdata3.tmpl: New file. diff --git a/linebreak/linebreak/Doxyfile b/linebreak/linebreak/Doxyfile new file mode 100644 index 0000000..d958af3 --- /dev/null +++ b/linebreak/linebreak/Doxyfile @@ -0,0 +1,1219 @@ +# Doxyfile 1.5.1 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project +# +# All text after a hash (#) is considered a comment and will be ignored +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (" ") + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded +# by quotes) that should identify the project. + +PROJECT_NAME = libunibreak + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. +# This could be handy for archiving the generated documentation or +# if some version control system is used. + +PROJECT_NUMBER = 1.0 + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. +# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. + +OUTPUT_DIRECTORY = doc + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. 
+# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Finnish, French, German, Greek, Hungarian, +# Italian, Japanese, Japanese-en (Japanese with English messages), Korean, +# Korean-en, Lithuanian, Norwegian, Polish, Portuguese, Romanian, Russian, +# Serbian, Slovak, Slovene, Spanish, Swedish, and Ukrainian. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to JavaDoc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. 
+# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = YES + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of +# the path mentioned in the documentation of a class, which tells +# the reader which header file to include in order to use a class. +# If left blank only the name of the header file containing the class +# definition is used. Otherwise one should specify the include paths that +# are normally passed to the compiler using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter +# (but less readable) file names. 
This can be useful if your file system +# doesn't support long names like on DOS, Mac, or CD-ROM. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen +# will interpret the first line (until the first dot) of a JavaDoc-style +# comment as the brief description. If set to NO, the JavaDoc +# comments will behave just like the Qt-style comments (thus requiring an +# explicit @brief command for a brief description). + +JAVADOC_AUTOBRIEF = YES + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen +# treat a multi-line C++ special comment block (i.e. a block of //! or /// +# comments) as a brief description. This used to be the default behaviour. +# The new default is to treat a multi-line C++ comment block as a detailed +# description. Set this tag to YES if you prefer the old behaviour instead. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented +# member inherits the documentation from any documented member that it +# re-implements. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce +# a new page for each member. If set to NO, the documentation of a member will +# be part of the file/class/namespace that contains it. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. +# Doxygen uses this value to replace tabs by spaces in code fragments. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act +# as commands in the documentation. An alias has the form "name=value". +# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user-defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. 
+ +ALIASES = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C +# sources only. Doxygen will then generate output that is more tailored for C. +# For instance, some of the names that are used will be different. The list +# of all members will be omitted, etc. + +OPTIMIZE_OUTPUT_FOR_C = YES + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java +# sources only. Doxygen will then generate output that is more tailored for Java. +# For instance, namespaces will be presented as packages, qualified scopes +# will look different, etc. + +OPTIMIZE_OUTPUT_JAVA = NO + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want to +# include (a tag file for) the STL sources as input, then you should +# set this tag to YES in order to let doxygen match function declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); vs. +# func(std::string) {}). This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. + +BUILTIN_STL_SUPPORT = NO + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. + +DISTRIBUTE_GROUP_DOC = NO + +# Set the SUBGROUPING tag to YES (the default) to allow class member groups of +# the same type (for instance a group of public functions) to be put as a +# subgroup of that type (e.g. under the Public Functions section). Set it to +# NO to prevent subgrouping. Alternatively, this can be done per class using +# the \nosubgrouping command. 
+ +SUBGROUPING = YES + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. +# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. +# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. 
+ +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. +# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. + +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. 
If set to NO the members will appear in +# declaration order. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the +# alphabetical list. + +SORT_BY_SCOPE_NAME = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional +# documentation sections, marked by \if sectionname ... \endif. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines +# the initial value of a variable or define consists of for it to appear in +# the documentation. If the initializer consists of more lines than specified +# here it will be hidden. 
Use a value of 0 to hide initializers completely. +# The appearance of the initializer of individual variables and defines in the +# documentation can be controlled using \showinitializer or \hideinitializer +# command in the documentation regardless of this setting. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated +# at the bottom of the documentation of classes and structs. If set to YES the +# list will mention the files that were used to generate the documentation. + +SHOW_USED_FILES = YES + +# If the sources in your project are distributed over multiple directories +# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy +# in the documentation. The default is NO. + +SHOW_DIRECTORIES = NO + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from the +# version control system). Doxygen will invoke the program by executing (via +# popen()) the command , where is the value of +# the FILE_VERSION_FILTER tag, and is the name of an input file +# provided by doxygen. Whatever the program writes to standard output +# is used as the file version. See the manual for examples. + +FILE_VERSION_FILTER = + +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated +# by doxygen. Possible values are YES and NO. If left blank NO is used. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated by doxygen. Possible values are YES and NO. If left blank +# NO is used. + +WARNINGS = YES + +# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings +# for undocumented members. 
If EXTRACT_ALL is set to YES then this flag will +# automatically be disabled. + +WARN_IF_UNDOCUMENTED = YES + +# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some +# parameters in a documented function, or documenting parameters that +# don't exist or using markup commands wrongly. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be abled to get warnings for +# functions that are documented, but have no documentation for their parameters +# or return value. If set to NO (the default) doxygen will only warn about +# wrong or incomplete parameter documentation, but not about the absence of +# documentation. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that +# doxygen can produce. The string should contain the $file, $line, and $text +# tags, which will be replaced by the file and line number from which the +# warning originated and the warning text. Optionally the format may contain +# $version, which will be replaced by the version of the file (if it could +# be obtained via FILE_VERSION_FILTER) + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning +# and error messages should be written. If left blank the output is written +# to stderr. + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag can be used to specify the files and/or directories that contain +# documented source files. You may enter file names like "myfile.cpp" or +# directories like "/usr/src/myproject". Separate the files or directories +# with spaces. 
+ +INPUT = + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank the following patterns are tested: +# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx +# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py + +FILE_PATTERNS = + +# The RECURSIVE tag can be used to turn specify whether or not subdirectories +# should be searched for input files as well. Possible values are YES and NO. +# If left blank NO is used. + +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. + +EXCLUDE = filter_dup.c + +# The EXCLUDE_SYMLINKS tag can be used select whether or not files or +# directories that are symbolic links (a Unix filesystem feature) are excluded +# from the input. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. Note that the wildcards are matched +# against the file with absolute path, so to exclude all test directories +# for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or +# directories that contain example code fragments that are included (see +# the \include command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank all files are included. 
+ +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude +# commands irrespective of the value of the RECURSIVE tag. +# Possible values are YES and NO. If left blank NO is used. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or +# directories that contain image that are included in the documentation (see +# the \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command , where +# is the value of the INPUT_FILTER tag, and is the name of an +# input file. Doxygen will then use the output that the filter program writes +# to standard output. If FILTER_PATTERNS is specified, this tag will be +# ignored. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: +# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further +# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER +# is applied to all files. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will be used to filter the input files when producing source +# files to browse (i.e. when SOURCE_BROWSER is set to YES). + +FILTER_SOURCE_FILES = NO + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. 
Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C and C++ comments will always remain visible. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES (the default) +# then for each documented function all documented +# functions referencing it will be listed. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES (the default) +# then for each documented function all documented entities +# called/used by that function will be listed. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) +# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from +# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will +# link to the source code. Otherwise they will link to the documentstion. + +REFERENCES_LINK_SOURCE = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser. The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. 
+ +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. + +ALPHABETICAL_INDEX = YES + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. +# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES (the default) Doxygen will +# generate HTML output. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `html' will be used as the default path. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for +# each generated HTML page (for example: .htm,.php,.asp). If it is left blank +# doxygen will generate files with .html extension. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a personal HTML header for +# each generated HTML page. 
If it is left blank doxygen will generate a +# standard header. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. If the tag is left blank doxygen +# will generate a default style sheet. Note that doxygen will try to copy +# the style sheet file to the HTML output directory, so don't put your own +# stylesheet in the HTML output directory as well, or it will be erased! + +HTML_STYLESHEET = + +# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, +# files or namespaces will be aligned in HTML using tables. If set to +# NO a bullet list will be used. + +HTML_ALIGN_MEMBERS = YES + +# If the GENERATE_HTMLHELP tag is set to YES, additional index files +# will be generated that can be used as input for tools like the +# Microsoft HTML help workshop to generate a compressed HTML help file (.chm) +# of the generated HTML documentation. + +GENERATE_HTMLHELP = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. + +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. + +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). 
+ +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. + +TOC_EXPAND = NO + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index at +# top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it. + +DISABLE_INDEX = NO + +# This tag can be used to set the number of enum values (range [1..20]) +# that doxygen will group on one line in the generated HTML documentation. + +ENUM_VALUES_PER_LINE = 4 + +# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be +# generated containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+, +# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are +# probably better off using the HTML help feature. + +GENERATE_TREEVIEW = NO + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. + +TREEVIEW_WIDTH = 250 + +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. + +GENERATE_LATEX = NO + +# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. 
+ +LATEX_OUTPUT = latex + +# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be +# invoked. If left blank `latex' will be used as the default command name. + +LATEX_CMD_NAME = latex + +# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to +# generate index for LaTeX. If left blank `makeindex' will be used as the +# default command name. + +MAKEINDEX_CMD_NAME = makeindex + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# LaTeX documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = NO + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer. Possible values are: a4, a4wide, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = a4wide + +# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX +# packages that should be included in the LaTeX output. + +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal LaTeX header for +# the generated latex document. The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header. Notice: only use this tag if you know what you are doing! + +LATEX_HEADER = + +# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated +# is prepared for conversion to pdf (using ps2pdf). The pdf file will +# contain links (just like the HTML output) instead of page references +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = NO + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. Set this option to YES to get a +# higher quality PDF documentation. + +USE_PDFLATEX = NO + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. +# command to the generated LaTeX files. 
This will instruct LaTeX to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +# If LATEX_HIDE_INDICES is set to YES then doxygen will not +# include the index chapters (such as File Index, Compound Index, etc.) +# in the output. + +LATEX_HIDE_INDICES = NO + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimized for Word 97 and may not look very pretty with +# other RTF readers or editors. + +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. + +RTF_OUTPUT = rtf + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. 
+ +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. + +MAN_OUTPUT = man + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = .3 + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = NO + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. + +XML_OUTPUT = xml + +# The XML_SCHEMA tag can be used to specify an XML schema, +# which can be used by a validating XML parser to check the +# syntax of the XML files. 
+ +XML_SCHEMA = + +# The XML_DTD tag can be used to specify an XML DTD, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_DTD = + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. + +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. + +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. + +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. This is useful +# if you want to understand what is going on. 
On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = NO + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = NO + +# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files +# in the INCLUDE_PATH (see below) will be search if a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. + +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. 
+ +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. + +PREDEFINED = + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition. + +EXPAND_AS_DEFINED = + +# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then +# doxygen's preprocessor will remove all function-like macros that are alone +# on a line, have an all uppercase name, and do not end with a semicolon. Such +# function macros are typically used for boiler-plate code, and will confuse +# the parser if not removed. + +SKIP_FUNCTION_MACROS = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- + +# The TAGFILES option can be used to specify one or more tagfiles. +# Optionally an initial location of the external documentation +# can be added for each tagfile. The format of a tag file without +# this location is as follows: +# TAGFILES = file1 file2 ... +# Adding location for the tag files is done as follows: +# TAGFILES = file1=loc1 "file2 = loc2" ... +# where "loc1" and "loc2" can be relative or absolute paths or +# URLs. If a location is present for each tag, the installdox tool +# does not have to be run to correct the links. 
+# Note that each tag file must have a unique name +# (where the name does NOT include the path) +# If a tag file is not located in the directory in which doxygen +# is run, you must also specify the path to the tagfile here. + +TAGFILES = + +# When a file name is specified after GENERATE_TAGFILE, doxygen will create +# a tag file that is based on the input files it reads. + +GENERATE_TAGFILE = + +# If the ALLEXTERNALS tag is set to YES all external classes will be listed +# in the class index. If set to NO only the inherited external classes +# will be listed. + +ALLEXTERNALS = NO + +# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed +# in the modules index. If set to NO, only the current project's groups will +# be listed. + +EXTERNAL_GROUPS = YES + +# The PERL_PATH should be the absolute path and name of the perl script +# interpreter (i.e. the result of `which perl'). + +PERL_PATH = /usr/bin/perl + +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- + +# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will +# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base +# or super classes. Setting the tag to NO turns the diagrams off. Note that +# this option is superseded by the HAVE_DOT option below. This is only a +# fallback. It is recommended to install and use dot, since it yields more +# powerful graphs. + +CLASS_DIAGRAMS = YES + +# If set to YES, the inheritance and collaboration graphs will hide +# inheritance and usage relations if the target is undocumented +# or is not a class. + +HIDE_UNDOC_RELATIONS = YES + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. 
The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = YES + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# CLASS_DIAGRAMS tag to NO. + +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = YES + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct group dependencies. + +GROUP_GRAPHS = YES + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = NO + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances. + +TEMPLATE_RELATIONS = NO + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. + +INCLUDE_GRAPH = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and +# HAVE_DOT tags are set to YES then doxygen will generate a graph for each +# documented header file showing the documented files that directly or +# indirectly include this file. + +INCLUDED_BY_GRAPH = YES + +# If the CALL_GRAPH and HAVE_DOT tags are set to YES then doxygen will +# generate a call dependency graph for every global function or class method.
+# Note that enabling this option will significantly increase the time of a run. +# So in most cases it will be better to enable call graphs for selected +# functions only using the \callgraph command. + +CALL_GRAPH = NO + +# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then doxygen will +# generate a caller dependency graph for every global function or class method. +# Note that enabling this option will significantly increase the time of a run. +# So in most cases it will be better to enable caller graphs for selected +# functions only using the \callergraph command. + +CALLER_GRAPH = NO + +# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen +# will generate a graphical hierarchy of all classes instead of a textual one. + +GRAPHICAL_HIERARCHY = YES + +# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES +# then doxygen will show the dependencies a directory has on other directories +# in a graphical way. The dependency relations are determined by the #include +# relations between the files in the directories. + +DIRECTORY_GRAPH = YES + +# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images +# generated by dot. Possible values are png, jpg, or gif. +# If left blank png will be used. + +DOT_IMAGE_FORMAT = png + +# The tag DOT_PATH can be used to specify the path where the dot tool can be +# found. If left blank, it is assumed the dot tool can be found in the path. + +DOT_PATH = + +# The DOTFILE_DIRS tag can be used to specify one or more directories that +# contain dot files that are included in the documentation (see the +# \dotfile command). + +DOTFILE_DIRS = + +# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the +# graphs generated by dot. A depth value of 3 means that only nodes reachable +# from the root by following a path via at most 3 edges will be shown. Nodes +# that lie further from the root node will be omitted.
Note that setting this +# option to 1 or 2 may greatly reduce the computation time needed for large +# code bases. Also note that a graph may be further truncated if the graph's +# image dimensions are not sufficient to fit the graph (see MAX_DOT_GRAPH_WIDTH +# and MAX_DOT_GRAPH_HEIGHT). If 0 is used for the depth value (the default), +# the graph is not depth-constrained. + +MAX_DOT_GRAPH_DEPTH = 0 + +# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent +# background. This is disabled by default, which results in a white background. +# Warning: Depending on the platform used, enabling this option may lead to +# badly anti-aliased labels on the edges of a graph (i.e. they become hard to +# read). + +DOT_TRANSPARENT = NO + +# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output +# files in one run (i.e. multiple -o and -T options on the command line). This +# makes dot run faster, but since only newer versions of dot (>1.8.10) +# support this, this feature is disabled by default. + +DOT_MULTI_TARGETS = NO + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. + +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. + +DOT_CLEANUP = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to the search engine +#--------------------------------------------------------------------------- + +# The SEARCHENGINE tag specifies whether or not a search engine should be +# used. If set to NO the values of all tags below this one will be ignored.
+ +SEARCHENGINE = NO diff --git a/linebreak/linebreak/LICENCE b/linebreak/linebreak/LICENCE new file mode 100644 index 0000000..ceec155 --- /dev/null +++ b/linebreak/linebreak/LICENCE @@ -0,0 +1,19 @@ +Copyright (C) 2008-2012 Wu Yongwei +Copyright (C) 2012 Tom Hacohen + +This software is provided 'as-is', without any express or implied +warranty. In no event will the author be held liable for any damages +arising from the use of this software. + +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it +freely, subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgement in the product documentation would + be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not + be misrepresented as being the original software. +3. This notice may not be removed or altered from any source + distribution. 
diff --git a/linebreak/linebreak/LineBreak1.sed b/linebreak/linebreak/LineBreak1.sed new file mode 100644 index 0000000..1be9651 --- /dev/null +++ b/linebreak/linebreak/LineBreak1.sed @@ -0,0 +1 @@ +s/\(^[0-9A-F.]\{1,\};[A-Z][A-Z0-9]\) #.*/\1/p diff --git a/linebreak/linebreak/LineBreak2.sed b/linebreak/linebreak/LineBreak2.sed new file mode 100644 index 0000000..8165958 --- /dev/null +++ b/linebreak/linebreak/LineBreak2.sed @@ -0,0 +1,2 @@ +s/^\([0-9A-F]\{1,\}\);/\1..\1;/ +s/^\([0-9A-F]\{1,\}\)\.\.\([0-9A-F]\{1,\}\);\([A-Z][A-Z0-9]\)/ { 0x\1, 0x\2, LBP_\3 },/ diff --git a/linebreak/linebreak/Makefile.am b/linebreak/linebreak/Makefile.am new file mode 100644 index 0000000..2810509 --- /dev/null +++ b/linebreak/linebreak/Makefile.am @@ -0,0 +1,63 @@ +#noinst_PROGRAMS = filter_dup +include_HEADERS = linebreak.h linebreakdef.h wordbreak.h wordbreakdef.h +lib_LTLIBRARIES = libunibreak.la +pkgconfig_DATA = libunibreak.pc +pkgconfigdir = ${libdir}/pkgconfig + +libunibreak_la_LDFLAGS = -no-undefined -version-info 1:0 +libunibreak_la_SOURCES = \ + linebreak.c \ + linebreakdata.c \ + linebreakdef.c \ + wordbreak.c + +EXTRA_DIST = \ + LineBreak1.sed \ + LineBreak2.sed \ + linebreakdata1.tmpl \ + linebreakdata2.tmpl \ + linebreakdata3.tmpl \ + wordbreakdata1.tmpl \ + wordbreakdata2.tmpl \ + wordbreakdata.c \ + LICENCE \ + Doxyfile \ + Makefile.gcc \ + Makefile.msvc \ + doc \ + sort_numeric_hex.py + +# Provide liblinebreak.a as an alias of libunibreak.a. ${DESTDIR} is +# honoured so that a staged install does not touch the live ${libdir}; the +# alias is created from inside the target directory so that it also works +# when ${LN_S} is not a symbolic link command. +install-exec-hook: + rm -f ${DESTDIR}${libdir}/liblinebreak.a + cd ${DESTDIR}${libdir} && ${LN_S} libunibreak.a liblinebreak.a + +# Remove the downloaded Unicode data files and the filter_dup helper. +distclean-local: + rm -f LineBreak.txt WordBreakProperty.txt filter_dup${EXEEXT} + +doc: + cd ${top_srcdir} && doxygen + +LineBreak.txt: + wget http://unicode.org/Public/UNIDATA/LineBreak.txt + +WordBreakProperty.txt: + wget http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt + +linebreakdata: ${builddir}/filter_dup LineBreak.txt + sed -n -f ${top_srcdir}/LineBreak1.sed LineBreak.txt > tmp.txt + sed -f ${top_srcdir}/LineBreak2.sed tmp.txt |
${builddir}/filter_dup > tmp.c + head -2 LineBreak.txt > tmp.txt + cat ${top_srcdir}/linebreakdata1.tmpl tmp.txt ${top_srcdir}/linebreakdata2.tmpl tmp.c ${top_srcdir}/linebreakdata3.tmpl > ${top_srcdir}/linebreakdata.c + rm tmp.txt tmp.c + +wordbreakdata: WordBreakProperty.txt + sed -E -n 's/(^[0-9A-F.]+)/\1/p' WordBreakProperty.txt > tmp2.txt + sed -E -i.bak 's/^([0-9A-F]+) +/\1..\1/' tmp2.txt + ${top_srcdir}/sort_numeric_hex.py tmp2.txt > tmp.txt + rm tmp2.txt tmp2.txt.bak + sed -E -i.bak -n 's/^([0-9A-F]+)..([0-9A-F]+) *; *([A-Za-z]+).*/'$$'\t''{0x\1, 0x\2, WBP_\3},/p' tmp.txt + echo "/* The content of this file is generated from:" > ${top_srcdir}/wordbreakdata.c + head -2 WordBreakProperty.txt >> ${top_srcdir}/wordbreakdata.c + echo "*/" >> ${top_srcdir}/wordbreakdata.c + cat ${top_srcdir}/wordbreakdata1.tmpl tmp.txt ${top_srcdir}/wordbreakdata2.tmpl >> ${top_srcdir}/wordbreakdata.c + rm tmp.txt tmp.txt.bak diff --git a/linebreak/linebreak/Makefile.gcc b/linebreak/linebreak/Makefile.gcc new file mode 100644 index 0000000..7b3c5ed --- /dev/null +++ b/linebreak/linebreak/Makefile.gcc @@ -0,0 +1,177 @@ +# Windows/Cygwin support +ifdef windir + WINDOWS := 1 + CYGWIN := 0 +else + ifdef WINDIR + WINDOWS := 1 + CYGWIN := 1 + else + WINDOWS := 0 + endif +endif +ifeq ($(WINDOWS),1) + EXEEXT := .exe + DLLEXT := .dll + DEVNUL := nul + ifeq ($(CYGWIN),1) + PATHSEP := / + else + PATHSEP := $(strip \ ) + endif +else + EXEEXT := + DLLEXT := .so + DEVNUL := /dev/null + PATHSEP := / +endif + +CFG ?= Debug +ifeq ($(CFG),Debug) + all: debug +else + all: release +endif + +OLDGCC ?= N + +DEBUG := DebugDir +RELEASE := ReleaseDir + +$(DEBUG)/%.o: %.c + $(CC) $(CFLAGS) $(CPPFLAGS) $(DBGFLAGS) $(TARGET_ARCH) -c -o $@ $< + +$(RELEASE)/%.o: %.c + $(CC) $(CFLAGS) $(CPPFLAGS) $(RELFLAGS) $(TARGET_ARCH) -c -o $@ $< + +$(DEBUG)/%.o: %.cpp + $(CXX) $(CXXFLAGS) $(CPPFLAGS) $(DBGFLAGS) $(TARGET_ARCH) -c -o $@ $< + +$(RELEASE)/%.o: %.cpp + $(CXX) $(CXXFLAGS) $(CPPFLAGS) $(RELFLAGS) 
$(TARGET_ARCH) -c -o $@ $< + +ifeq ($(OLDGCC),N) + +$(DEBUG)/%.dep: %.c + $(CC) -MM -MT $(patsubst %.dep,%.o,$@) $(CFLAGS) $(CPPFLAGS) $(DBGFLAGS) $(TARGET_ARCH) -o $@ $< + +$(RELEASE)/%.dep: %.c + $(CC) -MM -MT $(patsubst %.dep,%.o,$@) $(CFLAGS) $(CPPFLAGS) $(RELFLAGS) $(TARGET_ARCH) -o $@ $< + +$(DEBUG)/%.dep: %.cpp + $(CXX) -MM -MT $(patsubst %.dep,%.o,$@) $(CXXFLAGS) $(CPPFLAGS) $(DBGFLAGS) $(TARGET_ARCH) -o $@ $< + +$(RELEASE)/%.dep: %.cpp + $(CXX) -MM -MT $(patsubst %.dep,%.o,$@) $(CXXFLAGS) $(CPPFLAGS) $(RELFLAGS) $(TARGET_ARCH) -o $@ $< + +else + +$(DEBUG)/%.dep: %.c + $(CC) -MM $(CFLAGS) $(CPPFLAGS) $(DBGFLAGS) $(TARGET_ARCH) $< | sed "s!^!$(DEBUG)/!" > $@ + +$(RELEASE)/%.dep: %.c + $(CC) -MM $(CFLAGS) $(CPPFLAGS) $(RELFLAGS) $(TARGET_ARCH) $< | sed "s!^!$(RELEASE)/!" > $@ + +$(DEBUG)/%.dep: %.cpp + $(CXX) -MM $(CXXFLAGS) $(CPPFLAGS) $(DBGFLAGS) $(TARGET_ARCH) $< | sed "s!^!$(DEBUG)/!" > $@ + +$(RELEASE)/%.dep: %.cpp + $(CXX) -MM $(CXXFLAGS) $(CPPFLAGS) $(RELFLAGS) $(TARGET_ARCH) $< | sed "s!^!$(RELEASE)/!" > $@ + +endif + +CC = gcc +CXX = g++ +AR = ar +LD = $(CXX) $(CXXFLAGS) $(TARGET_ARCH) + +INCLUDE = -I. 
$(patsubst %,-I%,$(VPATH)) +CFLAGS = -W -Wall $(INCLUDE) +CXXFLAGS = $(CFLAGS) +DBGFLAGS = -D_DEBUG -g +RELFLAGS = -DNDEBUG -O2 +CPPFLAGS = + +ifeq ($(OLDGCC),N) + CFLAGS += -fmessage-length=0 +endif + +HFILES = $(wildcard $(patsubst -I%,%/*.h,$(INCLUDE))) +OBJFILES = $(CFILES:.c=.o) $(CXXFILES:.cpp=.o) + +DEBUG_OBJS = $(patsubst %.o,$(DEBUG)/%.o,$(OBJFILES)) +RELEASE_OBJS = $(patsubst %.o,$(RELEASE)/%.o,$(OBJFILES)) + +DEBUG_DEPS = $(patsubst %.o,%.dep,$(DEBUG_OBJS)) +RELEASE_DEPS = $(patsubst %.o,%.dep,$(RELEASE_OBJS)) + +CFILES := linebreak.c linebreakdata.c linebreakdef.c wordbreak.c +CXXFILES := + +LIBS := + +TARGET = liblinebreak.a +DEBUG_TARGET = $(patsubst %,$(DEBUG)/%,$(TARGET)) +RELEASE_TARGET = $(patsubst %,$(RELEASE)/%,$(TARGET)) + +debug: $(DEBUG) $(DEBUG_TARGET) + +release: $(RELEASE) $(RELEASE_TARGET) + + + +$(DEBUG): + mkdir $(DEBUG) + +$(RELEASE): + mkdir $(RELEASE) + +$(DEBUG_TARGET): $(DEBUG_DEPS) $(DEBUG_OBJS) + $(AR) -r $(DEBUG_TARGET) $(DEBUG_OBJS) + +$(RELEASE_TARGET): $(RELEASE_DEPS) $(RELEASE_OBJS) + $(AR) -r $(RELEASE_TARGET) $(RELEASE_OBJS) + +doc: + doxygen + +linebreakdata: filter_dup$(EXEEXT) LineBreak.txt + sed -n -f LineBreak1.sed LineBreak.txt > tmp.txt + sed -f LineBreak2.sed tmp.txt | .$(PATHSEP)filter_dup > tmp.c + head -2 LineBreak.txt > tmp.txt + cat linebreakdata1.tmpl tmp.txt linebreakdata2.tmpl tmp.c linebreakdata3.tmpl > linebreakdata.c + $(RM) tmp.txt tmp.c + +wordbreakdata: WordBreakProperty.txt + sed -E -n 's/(^[0-9A-F.]+)/\1/p' WordBreakProperty.txt > tmp2.txt + sed -E -i.bak 's/^([0-9A-F]+) +/\1..\1/' tmp2.txt + ./sort_numeric_hex.py tmp2.txt > tmp.txt + rm tmp2.txt tmp2.txt.bak + sed -E -i.bak -n 's/^([0-9A-F]+)..([0-9A-F]+) *; *([A-Za-z]+).*/'$$'\t''{0x\1, 0x\2, WBP_\3},/p' tmp.txt + echo "/* The content of this file is generated from:" > wordbreakdata.c + head -2 WordBreakProperty.txt >> wordbreakdata.c + echo "*/" >> wordbreakdata.c + cat wordbreakdata1.tmpl tmp.txt wordbreakdata2.tmpl >> wordbreakdata.c + rm 
tmp.txt tmp.txt.bak + +filter_dup$(EXEEXT): filter_dup.c + gcc -O2 -o filter_dup$(EXEEXT) $< + +LineBreak.txt: + wget http://unicode.org/Public/UNIDATA/LineBreak.txt + +WordBreakProperty.txt: + wget http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt + +.PHONY: all debug release clean distclean doc linebreakdata wordbreakdata + +clean: + $(RM) $(DEBUG)/*.o $(DEBUG)/*.dep $(DEBUG_TARGET) + $(RM) $(RELEASE)/*.o $(RELEASE)/*.dep $(RELEASE_TARGET) + +# In addition to clean: remove the filter_dup helper, the tags file and both +# downloaded Unicode data files, then the (now empty) build directories. +distclean: clean + $(RM) $(DEBUG)/* $(RELEASE)/* filter_dup$(EXEEXT) tags LineBreak.txt WordBreakProperty.txt + -rmdir $(DEBUG) 2> $(DEVNUL) + -rmdir $(RELEASE) 2> $(DEVNUL) + +-include $(wildcard $(DEBUG)/*.dep) $(wildcard $(RELEASE)/*.dep) diff --git a/linebreak/linebreak/Makefile.msvc b/linebreak/linebreak/Makefile.msvc new file mode 100644 index 0000000..4e71ad0 --- /dev/null +++ b/linebreak/linebreak/Makefile.msvc @@ -0,0 +1,189 @@ +# Makefile for Microsoft Visual C++ and NMAKE + +!IF "$(CFG)" == "" +CFG=libunibreak - Win32 Debug +!MESSAGE No configuration specified. Defaulting to libunibreak - Win32 Debug. +!ENDIF + +!IF "$(CFG)" != "libunibreak - Win32 Release" && "$(CFG)" != "libunibreak - Win32 Debug" +!MESSAGE Invalid configuration "$(CFG)" specified. +!MESSAGE You can specify a configuration when running NMAKE +!MESSAGE by defining the macro CFG on the command line. For example: +!MESSAGE +!MESSAGE NMAKE /f Makefile.msvc CFG="libunibreak - Win32 Debug" +!MESSAGE +!MESSAGE Possible choices for configuration are: +!MESSAGE +!MESSAGE "libunibreak - Win32 Release" (based on "Win32 (x86) Static Library") +!MESSAGE "libunibreak - Win32 Debug" (based on "Win32 (x86) Static Library") +!MESSAGE +!ERROR An invalid configuration is specified.
+!ENDIF + +!IF "$(OS)" == "Windows_NT" +NULL= +!ELSE +NULL=nul +!ENDIF + +CPP=cl.exe +RSC=rc.exe + +!IF "$(CFG)" == "libunibreak - Win32 Release" + +OUTDIR=.\Release +INTDIR=.\Release +# Begin Custom Macros +OutDir=.\Release +# End Custom Macros + +ALL : "$(OUTDIR)\unibreak.lib" + + +CLEAN : + -@erase "$(INTDIR)\linebreak.obj" + -@erase "$(INTDIR)\linebreakdata.obj" + -@erase "$(INTDIR)\linebreakdef.obj" + -@erase "$(INTDIR)\wordbreak.obj" + -@erase "$(INTDIR)\vc*.idb" + -@erase "$(OUTDIR)\unibreak.lib" + +"$(OUTDIR)" : + if not exist "$(OUTDIR)/$(NULL)" mkdir "$(OUTDIR)" + +CPP_PROJ=/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_MBCS" /D "_LIB" /Fo"$(INTDIR)\\" /Fd"$(INTDIR)\\" /FD /c +BSC32=bscmake.exe +BSC32_FLAGS=/nologo /o"$(OUTDIR)\unibreak.bsc" +BSC32_SBRS= \ + +LIB32=link.exe -lib +LIB32_FLAGS=/nologo /out:"$(OUTDIR)\unibreak.lib" +LIB32_OBJS= \ + "$(INTDIR)\linebreak.obj" \ + "$(INTDIR)\linebreakdata.obj" \ + "$(INTDIR)\linebreakdef.obj" \ + "$(INTDIR)\wordbreak.obj" + +"$(OUTDIR)\unibreak.lib" : "$(OUTDIR)" $(DEF_FILE) $(LIB32_OBJS) + $(LIB32) @<< + $(LIB32_FLAGS) $(DEF_FLAGS) $(LIB32_OBJS) +<< + +!ELSEIF "$(CFG)" == "libunibreak - Win32 Debug" + +OUTDIR=.\Debug +INTDIR=.\Debug +# Begin Custom Macros +OutDir=.\Debug +# End Custom Macros + +ALL : "$(OUTDIR)\unibreak.lib" + + +CLEAN : + -@erase "$(INTDIR)\linebreak.obj" + -@erase "$(INTDIR)\linebreakdata.obj" + -@erase "$(INTDIR)\linebreakdef.obj" + -@erase "$(INTDIR)\wordbreak.obj" + -@erase "$(INTDIR)\vc*.idb" + -@erase "$(INTDIR)\vc*.pdb" + -@erase "$(OUTDIR)\unibreak.lib" + +"$(OUTDIR)" : + if not exist "$(OUTDIR)/$(NULL)" mkdir "$(OUTDIR)" + +CPP_PROJ=/nologo /MLd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_MBCS" /D "_LIB" /Fo"$(INTDIR)\\" /Fd"$(INTDIR)\\" /FD /GZ /c +BSC32=bscmake.exe +BSC32_FLAGS=/nologo /o"$(OUTDIR)\unibreak.bsc" +BSC32_SBRS= \ + +LIB32=link.exe -lib +LIB32_FLAGS=/nologo /out:"$(OUTDIR)\unibreak.lib" +LIB32_OBJS= \ + "$(INTDIR)\linebreak.obj" \ + 
"$(INTDIR)\linebreakdata.obj" \ + "$(INTDIR)\linebreakdef.obj" \ + "$(INTDIR)\wordbreak.obj" + +"$(OUTDIR)\unibreak.lib" : "$(OUTDIR)" $(DEF_FILE) $(LIB32_OBJS) + $(LIB32) @<< + $(LIB32_FLAGS) $(DEF_FLAGS) $(LIB32_OBJS) +<< + +!ENDIF + +.c{$(INTDIR)}.obj:: + $(CPP) @<< + $(CPP_PROJ) $< +<< + +.cpp{$(INTDIR)}.obj:: + $(CPP) @<< + $(CPP_PROJ) $< +<< + +.cxx{$(INTDIR)}.obj:: + $(CPP) @<< + $(CPP_PROJ) $< +<< + +.c{$(INTDIR)}.sbr:: + $(CPP) @<< + $(CPP_PROJ) $< +<< + +.cpp{$(INTDIR)}.sbr:: + $(CPP) @<< + $(CPP_PROJ) $< +<< + +.cxx{$(INTDIR)}.sbr:: + $(CPP) @<< + $(CPP_PROJ) $< +<< + + +.\linebreak.c : \ + ".\linebreak.h"\ + ".\linebreakdef.h"\ + +.\linebreakdata.c : \ + ".\linebreak.h"\ + ".\linebreakdef.h"\ + +.\linebreakdef.c : \ + ".\linebreak.h"\ + ".\linebreakdef.h"\ + +.\wordbreak.c : \ + ".\linebreak.h"\ + ".\linebreakdef.h"\ + ".\wordbreak.h"\ + ".\wordbreakdef.h"\ + ".\wordbreakdata.c"\ + + +!IF "$(CFG)" == "libunibreak - Win32 Release" || "$(CFG)" == "libunibreak - Win32 Debug" +SOURCE=.\linebreak.c + +"$(INTDIR)\linebreak.obj" : $(SOURCE) "$(INTDIR)" + + +SOURCE=.\linebreakdata.c + +"$(INTDIR)\linebreakdata.obj" : $(SOURCE) "$(INTDIR)" + + +SOURCE=.\linebreakdef.c + +"$(INTDIR)\linebreakdef.obj" : $(SOURCE) "$(INTDIR)" + + +SOURCE=.\wordbreak.c + +"$(INTDIR)\wordbreak.obj" : $(SOURCE) "$(INTDIR)" + + + +!ENDIF + diff --git a/linebreak/linebreak/NEWS b/linebreak/linebreak/NEWS new file mode 100644 index 0000000..581cab7 --- /dev/null +++ b/linebreak/linebreak/NEWS @@ -0,0 +1,49 @@ +New in libunibreak 1.0 + +- Add word breaking support +- Change the library name to "libunibreak", while keeping maximum compatibility +- Add pkg-config support + +New in liblinebreak 2.1 + +- Update the data according to LineBreak-6.0.0.txt +- Fix the bug that an assertion in code can fail if U+FFFC is + encountered at the beginning of a line + +New in liblinebreak 2.0 + +- Update the algorithm and data according to UAX #14-24 and + LineBreak-5.2.0.txt +- Rename some functions to 
reduce namespace pollution +- Make Doxygen documentation better + +New in liblinebreak 1.2 + +- Fix the bug that an assertion in code can fail if an invalid UTF-8 or + UTF-16 sequence is encountered near the end of input +- Remove the specialization of right single quotation mark as closing + punctuation mark in English, French, and Spanish, because it can be + used as apostrophe +- Make Doxygen documentation better + +New in liblinebreak 1.1 + +- Make get_lb_prop_lang static and not an exported symbol +- Define is_line_breakable to alias to is_breakable +- Declare get_next_char_utf* will be changed to lb_get_next_char_utf* +- Move the declarations of get_next_char_utf* from linebreak.h to + linebreakdef.h +- Add the function documentation comments to the header files + +New in liblinebreak 1.0 + +- Update the line breaking data according to UAX #14-22 and + LineBreak-5.1.0.txt +- Add autoconfiscation support (./configure, make, make install) +- Add Makefile for MSVC + +First public release (0.9.6, or 20080421) + +- Implement line breaking algorithm according to UAX #14-19 +- Line breaking data is generated from LineBreak-5.0.0.txt +- Makefile only supports GCC diff --git a/linebreak/linebreak/README b/linebreak/linebreak/README new file mode 100644 index 0000000..bdb8972 --- /dev/null +++ b/linebreak/linebreak/README @@ -0,0 +1,88 @@ + L I B U N I B R E A K + ===================== + +Overview +-------- + +This is the README file for libunibreak, an implementation of the line +breaking and word breaking algorithms as described in Unicode +Standard Annex 14 and Unicode Standard Annex 29, available at + + + +Check this URL for up-to-date information: + + + +Licence +------- + +This library is released under an open-source licence, the zlib/libpng +licence. Please check the file LICENCE for details. 
+ +Apart from using the algorithm, part of the code is derived from the +data provided under + + +And the Unicode Terms of Use may apply: + + + +Installation +------------ + +There are three ways to build the library: + +1) On *NIX systems supported by the autoconfiscation tools, do the + normal + + ./configure + make + sudo make install + + to build and install both the dynamic and static libraries. In + addition, one may + + - type `make doc' to generate the doxygen documentation; or + - type `make linebreakdata' to regenerate linebreakdata.c from + LineBreak.txt. + - type ‘make wordbreakdata’ to regenerate wordbreakdata.c from + WordBreakProperty.txt. + +2) On systems where GCC and Binutils are supported, one can type + + cp -p Makefile.gcc Makefile + make + + to build the static library. In addition, one may + + - type `make debug' or `make release' to explicitly generate the + debug or release build; + - type `make doc' to generate the doxygen documentation; or + - type `make linebreakdata' to regenerate linebreakdata.c from + LineBreak.txt. + - type ‘make wordbreakdata’ to regenerate wordbreakdata.c from + WordBreakProperty.txt. + +3) On Windows, apart from using method 1 (Cygwin/MSYS) and method 2 + (MinGW), MSVC can also be used. Type + + nmake -f Makefile.msvc + + to build the static library. By default the debug release is built. + To build the release version + + nmake -f Makefile.msvc CFG="libunibreak - Win32 Release" + + +Documentation +------------- + +Check the generated document doc/html/linebreak_8h.html and +doc/html/wordbreak_8h.html in the downloaded file for the public +interfaces exposed to applications. + + +$Id: README,v 1.8 2012/08/11 06:55:18 adah Exp $ + +vim:autoindent:expandtab:formatoptions=tcqlmn:textwidth=72: diff --git a/linebreak/linebreak/bootstrap b/linebreak/linebreak/bootstrap new file mode 100755 index 0000000..94173fd --- /dev/null +++ b/linebreak/linebreak/bootstrap @@ -0,0 +1,6 @@ +#! 
/bin/sh +aclocal && \ +autoheader && \ +autoconf && \ +libtoolize && \ +automake --add-missing diff --git a/linebreak/linebreak/configure.ac b/linebreak/linebreak/configure.ac new file mode 100644 index 0000000..4505064 --- /dev/null +++ b/linebreak/linebreak/configure.ac @@ -0,0 +1,12 @@ +AC_PREREQ(2.57) +AC_INIT([libunibreak],[1.0],[wuyongwei@gmail.com]) +AC_CONFIG_SRCDIR([linebreak.c]) +AC_CONFIG_HEADERS([config.h]) +AM_INIT_AUTOMAKE([foreign]) + +AC_PROG_CC +AC_PROG_LN_S +AC_EXEEXT +AM_PROG_LIBTOOL +AC_CONFIG_FILES([Makefile]) +AC_OUTPUT([libunibreak.pc]) diff --git a/linebreak/linebreak/filter_dup.c b/linebreak/linebreak/filter_dup.c new file mode 100644 index 0000000..a25ac70 --- /dev/null +++ b/linebreak/linebreak/filter_dup.c @@ -0,0 +1,47 @@ +#include <stdio.h> +#include <string.h> + +/* + * Filter for generated line-break table rows. Reads rows of the form + * "\t{ 0xBEG, 0xEND, LBP_xxx }," from stdin and merges each run of + * consecutive rows that share the same property into a single row spanning + * from the run's first begin value to its last end value. Lines that carry + * no "LBP_" property, that contain "LBP_Undef", or that do not parse as a + * row flush the pending merged row and are then copied to stdout unchanged. + */ +int main(void) +{ + char s[80]; + char beg[16]; + char end[16]; + char prop[16]; + char lastbeg[16]; + char lastend[16]; + char lastprop[16]; + lastprop[0] = 0; /* empty lastprop means no row is pending */ + for (;;) + { + if (fgets(s, sizeof s, stdin) == NULL) + break; + /* The %15s widths keep every token inside its 16-byte buffer. */ + if (strstr(s, "LBP_") == NULL || strstr(s, "LBP_Undef") != NULL + || sscanf(s, "\t{ %15s %15s %15s }", beg, end, prop) != 3) + { + if (lastprop[0]) + { + printf("\t{ %s %s %s },\n", lastbeg, lastend, lastprop); + lastprop[0] = 0; + } + printf("%s", s); + continue; + } + /*printf("==>\t{ \"%s\" \"%s\" \"%s\" },\n", beg, end, prop);*/ + if (lastprop[0] && strcmp(lastprop, prop) != 0) + { + printf("\t{ %s %s %s },\n", lastbeg, lastend, lastprop); + lastprop[0] = 0; + } + if (lastprop[0] == 0) + { + strcpy(lastbeg, beg); + strcpy(lastprop, prop); + } + strcpy(lastend, end); + } + if (lastprop[0]) + { + printf("\t{ %s %s %s },\n", lastbeg, lastend, lastprop); + } + return 0; +} diff --git a/linebreak/linebreak/libunibreak.pc.in b/linebreak/linebreak/libunibreak.pc.in new file mode 100644 index 0000000..e0509fc --- /dev/null +++ b/linebreak/linebreak/libunibreak.pc.in @@ -0,0 +1,11 @@ +libunibreak: +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name:
libunibreak +Description: Library to implement Unicode algorithms for line and word breaking +Version: @VERSION@ +Libs: -L${libdir} -lunibreak +Cflags: -I${includedir} diff --git a/linebreak/linebreak/linebreak.c b/linebreak/linebreak/linebreak.c new file mode 100644 index 0000000..87a6d7d --- /dev/null +++ b/linebreak/linebreak/linebreak.c @@ -0,0 +1,737 @@ +/* vim: set tabstop=4 shiftwidth=4: */ + +/* + * Line breaking in a Unicode sequence. Designed to be used in a + * generic text renderer. + * + * Copyright (C) 2008-2011 Wu Yongwei + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute + * it freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must + * not claim that you wrote the original software. If you use this + * software in a product, an acknowledgement in the product + * documentation would be appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must + * not be misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source + * distribution. + * + * The main reference is Unicode Standard Annex 14 (UAX #14): + * + * + * When this library was designed, this annex was at Revision 19, for + * Unicode 5.0.0: + * + * + * This library has been updated according to Revision 26, for + * Unicode 6.0.0: + * + * + * The Unicode Terms of Use are available at + * + */ + +/** + * @file linebreak.c + * + * Implementation of the line breaking algorithm as described in Unicode + * Standard Annex 14. 
+ * + * @version 2.1, 2011/05/07 + * @author Wu Yongwei + */ + +#include <assert.h> +#include <stddef.h> +#include <string.h> +#include "linebreak.h" +#include "linebreakdef.h" + +/** + * Size of the second-level index to the line breaking properties. + */ +#define LINEBREAK_INDEX_SIZE 40 + +/** + * Version number of the library. + */ +const int linebreak_version = LINEBREAK_VERSION; + +/** + * Enumeration of break actions. They are used in the break action + * pair table below. + */ +enum BreakAction +{ + DIR_BRK, /**< Direct break opportunity */ + IND_BRK, /**< Indirect break opportunity */ + CMI_BRK, /**< Indirect break opportunity for combining marks */ + CMP_BRK, /**< Prohibited break for combining marks */ + PRH_BRK /**< Prohibited break */ +}; + +/** + * Break action pair table. This is a direct mapping of Table 2 of + * Unicode Standard Annex 14, Revision 24. + */ +static enum BreakAction baTable[LBP_JT][LBP_JT] = { + { /* OP */ + PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, CMP_BRK, + PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK }, + { /* CL */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, + { /* CP */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, + DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, + { /* QU */ + PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, + IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK }, + { /* GL */ + IND_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, + IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK }, + { /* NS */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, + { /* EX */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, + { /* SY */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, + { /* IS */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, + DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, + { /* PR */ + IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK, + DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK }, + { /* PO */ + IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, + DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, + { /* NU */ + IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, + IND_BRK, 
IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, + { /* AL */ + IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, + IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, + { /* ID */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, + { /* IN */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, + { /* HY */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, + { /* BA */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, + { /* BB */ + IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, + IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK }, + { /* B2 */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, + { /* ZW */ + DIR_BRK, 
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK, DIR_BRK, + DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, + { /* CM */ + IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, + IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK }, + { /* WJ */ + IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, + IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK }, + { /* H2 */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK }, + { /* H3 */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK }, + { /* JL */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK }, + { /* JV */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK }, + { /* JT */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + IND_BRK, 
IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK, + PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK } +}; + +/** + * Struct for the second-level index to the line breaking properties. + */ +struct LineBreakPropertiesIndex +{ + utf32_t end; /**< End coding point */ + struct LineBreakProperties *lbp;/**< Pointer to line breaking properties */ +}; + +/** + * Second-level index to the line breaking properties. + */ +static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] = +{ + { 0xFFFFFFFF, lb_prop_default } +}; + +/** + * Initializes the second-level index to the line breaking properties. + * If it is not called, the performance of #get_char_lb_class_lang (and + * thus the main functionality) can be pretty bad, especially for big + * code points like those of Chinese. + */ +void init_linebreak(void) +{ + size_t i; + size_t iPropDefault; + size_t len; + size_t step; + + len = 0; + while (lb_prop_default[len].prop != LBP_Undefined) + ++len; + step = len / LINEBREAK_INDEX_SIZE; + iPropDefault = 0; + for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i) + { + lb_prop_index[i].lbp = lb_prop_default + iPropDefault; + iPropDefault += step; + lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1; + } + lb_prop_index[--i].end = 0xFFFFFFFF; +} + +/** + * Gets the language-specific line breaking properties. + * + * @param lang language of the text + * @return pointer to the language-specific line breaking + * properties array if found; \c NULL otherwise + */ +static struct LineBreakProperties *get_lb_prop_lang(const char *lang) +{ + struct LineBreakPropertiesLang *lbplIter; + if (lang != NULL) + { + for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter) + { + if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0) + { + return lbplIter->lbp; + } + } + } + return NULL; +} + +/** + * Gets the line breaking class of a character from a line breaking + * properties array. 
+ * + * @param ch character to check + * @param lbp pointer to the line breaking properties array + * @return the line breaking class if found; \c LBP_XX otherwise + */ +static enum LineBreakClass get_char_lb_class( + utf32_t ch, + struct LineBreakProperties *lbp) +{ + while (lbp->prop != LBP_Undefined && ch >= lbp->start) + { + if (ch <= lbp->end) + return lbp->prop; + ++lbp; + } + return LBP_XX; +} + +/** + * Gets the line breaking class of a character from the default line + * breaking properties array. + * + * @param ch character to check + * @return the line breaking class if found; \c LBP_XX otherwise + */ +static enum LineBreakClass get_char_lb_class_default( + utf32_t ch) +{ + size_t i = 0; + while (ch > lb_prop_index[i].end) + ++i; + assert(i < LINEBREAK_INDEX_SIZE); + return get_char_lb_class(ch, lb_prop_index[i].lbp); +} + +/** + * Gets the line breaking class of a character for a specific + * language. This function will check the language-specific data first, + * and then the default data if there is no language-specific property + * available for the character. + * + * @param ch character to check + * @param lbpLang pointer to the language-specific line breaking + * properties array + * @return the line breaking class if found; \c LBP_XX + * otherwise + */ +static enum LineBreakClass get_char_lb_class_lang( + utf32_t ch, + struct LineBreakProperties *lbpLang) +{ + enum LineBreakClass lbcResult; + + /* Find the language-specific line breaking class for a character */ + if (lbpLang) + { + lbcResult = get_char_lb_class(ch, lbpLang); + if (lbcResult != LBP_XX) + return lbcResult; + } + + /* Find the generic language-specific line breaking class, if no + * language context is provided, or language-specific data are not + * available for the specific character in the specified language */ + return get_char_lb_class_default(ch); +} + +/** + * Resolves the line breaking class for certain ambiguous or complicated + * characters. 
They are treated in a simplistic way in this + * implementation. + * + * @param lbc line breaking class to resolve + * @param lang language of the text + * @return the resolved line breaking class + */ +static enum LineBreakClass resolve_lb_class( + enum LineBreakClass lbc, + const char *lang) +{ + switch (lbc) + { + case LBP_AI: + if (lang != NULL && + (strncmp(lang, "zh", 2) == 0 || /* Chinese */ + strncmp(lang, "ja", 2) == 0 || /* Japanese */ + strncmp(lang, "ko", 2) == 0)) /* Korean */ + { + return LBP_ID; + } + /* Fall through */ + case LBP_SA: + case LBP_SG: + case LBP_XX: + return LBP_AL; + default: + return lbc; + } +} + +/** + * Gets the next Unicode character in a UTF-8 sequence. The index will + * be advanced to the next complete character, unless the end of string + * is reached in the middle of a UTF-8 sequence. + * + * @param[in] s input UTF-8 string + * @param[in] len length of the string in bytes + * @param[in,out] ip pointer to the index + * @return the Unicode character beginning at the index; or + * #EOS if end of input is encountered + */ +utf32_t lb_get_next_char_utf8( + const utf8_t *s, + size_t len, + size_t *ip) +{ + utf8_t ch; + utf32_t res; + + assert(*ip <= len); + if (*ip == len) + return EOS; + ch = s[*ip]; + + if (ch < 0xC2 || ch > 0xF4) + { /* One-byte sequence, tail (should not occur), or invalid */ + *ip += 1; + return ch; + } + else if (ch < 0xE0) + { /* Two-byte sequence */ + if (*ip + 2 > len) + return EOS; + res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F); + *ip += 2; + return res; + } + else if (ch < 0xF0) + { /* Three-byte sequence */ + if (*ip + 3 > len) + return EOS; + res = ((ch & 0x0F) << 12) + + ((s[*ip + 1] & 0x3F) << 6) + + ((s[*ip + 2] & 0x3F)); + *ip += 3; + return res; + } + else + { /* Four-byte sequence */ + if (*ip + 4 > len) + return EOS; + res = ((ch & 0x07) << 18) + + ((s[*ip + 1] & 0x3F) << 12) + + ((s[*ip + 2] & 0x3F) << 6) + + ((s[*ip + 3] & 0x3F)); + *ip += 4; + return res; + } +} + +/** + * Gets the next 
Unicode character in a UTF-16 sequence. The index will + * be advanced to the next complete character, unless the end of string + * is reached in the middle of a UTF-16 surrogate pair. + * + * @param[in] s input UTF-16 string + * @param[in] len length of the string in words + * @param[in,out] ip pointer to the index + * @return the Unicode character beginning at the index; or + * #EOS if end of input is encountered + */ +utf32_t lb_get_next_char_utf16( + const utf16_t *s, + size_t len, + size_t *ip) +{ + utf16_t ch; + + assert(*ip <= len); + if (*ip == len) + return EOS; + ch = s[(*ip)++]; + + if (ch < 0xD800 || ch > 0xDBFF) + { /* If the character is not a high surrogate */ + return ch; + } + if (*ip == len) + { /* If the input ends here (an error) */ + --(*ip); + return EOS; + } + if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF) + { /* If the next character is not the low surrogate (an error) */ + return ch; + } + /* Return the constructed character and advance the index again */ + return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000; +} + +/** + * Gets the next Unicode character in a UTF-32 sequence. The index will + * be advanced to the next character. + * + * @param[in] s input UTF-32 string + * @param[in] len length of the string in dwords + * @param[in,out] ip pointer to the index + * @return the Unicode character beginning at the index; or + * #EOS if end of input is encountered + */ +utf32_t lb_get_next_char_utf32( + const utf32_t *s, + size_t len, + size_t *ip) +{ + assert(*ip <= len); + if (*ip == len) + return EOS; + return s[(*ip)++]; +} + +/** + * Sets the line breaking information for a generic input string. 
+ * + * @param[in] s input string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, + * containing #LINEBREAK_MUSTBREAK, + * #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK, + * or #LINEBREAK_INSIDEACHAR + * @param[in] get_next_char function to get the next UTF-32 character + */ +void set_linebreaks( + const void *s, + size_t len, + const char *lang, + char *brks, + get_next_char_t get_next_char) +{ + utf32_t ch; + enum LineBreakClass lbcCur; + enum LineBreakClass lbcNew; + enum LineBreakClass lbcLast; + struct LineBreakProperties *lbpLang; + size_t posCur = 0; + size_t posLast = 0; + + --posLast; /* To be ++'d later */ + ch = get_next_char(s, len, &posCur); + if (ch == EOS) + return; + lbpLang = get_lb_prop_lang(lang); + lbcCur = resolve_lb_class(get_char_lb_class_lang(ch, lbpLang), lang); + lbcNew = LBP_Undefined; + +nextline: + + /* Special treatment for the first character */ + switch (lbcCur) + { + case LBP_LF: + case LBP_NL: + lbcCur = LBP_BK; + break; + case LBP_CB: + lbcCur = LBP_BA; + break; + case LBP_SP: + lbcCur = LBP_WJ; + break; + default: + break; + } + + /* Process a line till an explicit break or end of string */ + for (;;) + { + for (++posLast; posLast < posCur - 1; ++posLast) + { + brks[posLast] = LINEBREAK_INSIDEACHAR; + } + assert(posLast == posCur - 1); + lbcLast = lbcNew; + ch = get_next_char(s, len, &posCur); + if (ch == EOS) + break; + lbcNew = get_char_lb_class_lang(ch, lbpLang); + if (lbcCur == LBP_BK || (lbcCur == LBP_CR && lbcNew != LBP_LF)) + { + brks[posLast] = LINEBREAK_MUSTBREAK; + lbcCur = resolve_lb_class(lbcNew, lang); + goto nextline; + } + + switch (lbcNew) + { + case LBP_SP: + brks[posLast] = LINEBREAK_NOBREAK; + continue; + case LBP_BK: + case LBP_LF: + case LBP_NL: + brks[posLast] = LINEBREAK_NOBREAK; + lbcCur = LBP_BK; + continue; + case LBP_CR: + brks[posLast] = LINEBREAK_NOBREAK; + lbcCur = LBP_CR; + continue; + case LBP_CB: + brks[posLast] 
= LINEBREAK_ALLOWBREAK; + lbcCur = LBP_BA; + continue; + default: + break; + } + + lbcNew = resolve_lb_class(lbcNew, lang); + + assert(lbcCur <= LBP_JT); + assert(lbcNew <= LBP_JT); + switch (baTable[lbcCur - 1][lbcNew - 1]) + { + case DIR_BRK: + brks[posLast] = LINEBREAK_ALLOWBREAK; + break; + case CMI_BRK: + case IND_BRK: + if (lbcLast == LBP_SP) + { + brks[posLast] = LINEBREAK_ALLOWBREAK; + } + else + { + brks[posLast] = LINEBREAK_NOBREAK; + } + break; + case CMP_BRK: + brks[posLast] = LINEBREAK_NOBREAK; + if (lbcLast != LBP_SP) + continue; + break; + case PRH_BRK: + brks[posLast] = LINEBREAK_NOBREAK; + break; + } + + lbcCur = lbcNew; + } + + assert(posLast == posCur - 1 && posCur <= len); + /* Break after the last character */ + brks[posLast] = LINEBREAK_MUSTBREAK; + /* When the input contains incomplete sequences */ + while (posCur < len) + { + brks[posCur++] = LINEBREAK_INSIDEACHAR; + } +} + +/** + * Sets the line breaking information for a UTF-8 input string. + * + * @param[in] s input UTF-8 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK, + * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR + */ +void set_linebreaks_utf8( + const utf8_t *s, + size_t len, + const char *lang, + char *brks) +{ + set_linebreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf8); +} + +/** + * Sets the line breaking information for a UTF-16 input string. 
+ * + * @param[in] s input UTF-16 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK, + * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR + */ +void set_linebreaks_utf16( + const utf16_t *s, + size_t len, + const char *lang, + char *brks) +{ + set_linebreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf16); +} + +/** + * Sets the line breaking information for a UTF-32 input string. + * + * @param[in] s input UTF-32 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK, + * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR + */ +void set_linebreaks_utf32( + const utf32_t *s, + size_t len, + const char *lang, + char *brks) +{ + set_linebreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf32); +} + +/** + * Tells whether a line break can occur between two Unicode characters. + * This is a wrapper function to expose a simple interface. Generally + * speaking, it is better to use #set_linebreaks_utf32 instead, since + * complicated cases involving combining marks, spaces, etc. cannot be + * correctly processed. 
+ * + * @param char1 the first Unicode character + * @param char2 the second Unicode character + * @param lang language of the input + * @return one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK, + * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR + */ +int is_line_breakable( + utf32_t char1, + utf32_t char2, + const char* lang) +{ + utf32_t s[2]; + char brks[2]; + s[0] = char1; + s[1] = char2; + set_linebreaks_utf32(s, 2, lang, brks); + return brks[0]; +} diff --git a/linebreak/linebreak/linebreak.h b/linebreak/linebreak/linebreak.h new file mode 100644 index 0000000..e5250c0 --- /dev/null +++ b/linebreak/linebreak/linebreak.h @@ -0,0 +1,87 @@ +/* vim: set tabstop=4 shiftwidth=4: */ + +/* + * Line breaking in a Unicode sequence. Designed to be used in a + * generic text renderer. + * + * Copyright (C) 2008-2011 Wu Yongwei + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute + * it freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must + * not claim that you wrote the original software. If you use this + * software in a product, an acknowledgement in the product + * documentation would be appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must + * not be misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source + * distribution. 
+ * + * The main reference is Unicode Standard Annex 14 (UAX #14): + * + * + * When this library was designed, this annex was at Revision 19, for + * Unicode 5.0.0: + * + * + * This library has been updated according to Revision 26, for + * Unicode 6.0.0: + * + * + * The Unicode Terms of Use are available at + * + */ + +/** + * @file linebreak.h + * + * Header file for the line breaking algorithm. + * + * @version 2.1, 2011/05/07 + * @author Wu Yongwei + */ + +#ifndef LINEBREAK_H +#define LINEBREAK_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define LINEBREAK_VERSION 0x0201 /**< Version of the library linebreak */ +extern const int linebreak_version; + +#ifndef LINEBREAK_UTF_TYPES_DEFINED +#define LINEBREAK_UTF_TYPES_DEFINED +typedef unsigned char utf8_t; /**< Type for UTF-8 data points */ +typedef unsigned short utf16_t; /**< Type for UTF-16 data points */ +typedef unsigned int utf32_t; /**< Type for UTF-32 data points */ +#endif + +#define LINEBREAK_MUSTBREAK 0 /**< Break is mandatory */ +#define LINEBREAK_ALLOWBREAK 1 /**< Break is allowed */ +#define LINEBREAK_NOBREAK 2 /**< No break is possible */ +#define LINEBREAK_INSIDEACHAR 3 /**< A UTF-8/16 sequence is unfinished */ + +void init_linebreak(void); +void set_linebreaks_utf8( + const utf8_t *s, size_t len, const char* lang, char *brks); +void set_linebreaks_utf16( + const utf16_t *s, size_t len, const char* lang, char *brks); +void set_linebreaks_utf32( + const utf32_t *s, size_t len, const char* lang, char *brks); +int is_line_breakable(utf32_t char1, utf32_t char2, const char* lang); + +#ifdef __cplusplus +} +#endif + +#endif /* LINEBREAK_H */ diff --git a/linebreak/linebreak/linebreakdata.c b/linebreak/linebreak/linebreakdata.c new file mode 100644 index 0000000..0021479 --- /dev/null +++ b/linebreak/linebreak/linebreakdata.c @@ -0,0 +1,1868 @@ +/* The content of this file is generated from: +# LineBreak-6.0.0.txt +# Date: 2010-08-18, 17:25:00 PDT [KW] +*/ + +#include "linebreak.h" 
+#include "linebreakdef.h" + +/** Default line breaking properties as from the Unicode Web site. */ +struct LineBreakProperties lb_prop_default[] = { + { 0x0000, 0x0008, LBP_CM }, + { 0x0009, 0x0009, LBP_BA }, + { 0x000A, 0x000A, LBP_LF }, + { 0x000B, 0x000C, LBP_BK }, + { 0x000D, 0x000D, LBP_CR }, + { 0x000E, 0x001F, LBP_CM }, + { 0x0020, 0x0020, LBP_SP }, + { 0x0021, 0x0021, LBP_EX }, + { 0x0022, 0x0022, LBP_QU }, + { 0x0023, 0x0023, LBP_AL }, + { 0x0024, 0x0024, LBP_PR }, + { 0x0025, 0x0025, LBP_PO }, + { 0x0026, 0x0026, LBP_AL }, + { 0x0027, 0x0027, LBP_QU }, + { 0x0028, 0x0028, LBP_OP }, + { 0x0029, 0x0029, LBP_CP }, + { 0x002A, 0x002A, LBP_AL }, + { 0x002B, 0x002B, LBP_PR }, + { 0x002C, 0x002C, LBP_IS }, + { 0x002D, 0x002D, LBP_HY }, + { 0x002E, 0x002E, LBP_IS }, + { 0x002F, 0x002F, LBP_SY }, + { 0x0030, 0x0039, LBP_NU }, + { 0x003A, 0x003B, LBP_IS }, + { 0x003C, 0x003E, LBP_AL }, + { 0x003F, 0x003F, LBP_EX }, + { 0x0040, 0x005A, LBP_AL }, + { 0x005B, 0x005B, LBP_OP }, + { 0x005C, 0x005C, LBP_PR }, + { 0x005D, 0x005D, LBP_CP }, + { 0x005E, 0x007A, LBP_AL }, + { 0x007B, 0x007B, LBP_OP }, + { 0x007C, 0x007C, LBP_BA }, + { 0x007D, 0x007D, LBP_CL }, + { 0x007E, 0x007E, LBP_AL }, + { 0x007F, 0x0084, LBP_CM }, + { 0x0085, 0x0085, LBP_NL }, + { 0x0086, 0x009F, LBP_CM }, + { 0x00A0, 0x00A0, LBP_GL }, + { 0x00A1, 0x00A1, LBP_OP }, + { 0x00A2, 0x00A2, LBP_PO }, + { 0x00A3, 0x00A5, LBP_PR }, + { 0x00A6, 0x00A6, LBP_AL }, + { 0x00A7, 0x00A8, LBP_AI }, + { 0x00A9, 0x00A9, LBP_AL }, + { 0x00AA, 0x00AA, LBP_AI }, + { 0x00AB, 0x00AB, LBP_QU }, + { 0x00AC, 0x00AC, LBP_AL }, + { 0x00AD, 0x00AD, LBP_BA }, + { 0x00AE, 0x00AF, LBP_AL }, + { 0x00B0, 0x00B0, LBP_PO }, + { 0x00B1, 0x00B1, LBP_PR }, + { 0x00B2, 0x00B3, LBP_AI }, + { 0x00B4, 0x00B4, LBP_BB }, + { 0x00B5, 0x00B5, LBP_AL }, + { 0x00B6, 0x00BA, LBP_AI }, + { 0x00BB, 0x00BB, LBP_QU }, + { 0x00BC, 0x00BE, LBP_AI }, + { 0x00BF, 0x00BF, LBP_OP }, + { 0x00C0, 0x00D6, LBP_AL }, + { 0x00D7, 0x00D7, LBP_AI }, + { 0x00D8, 0x00F6, 
LBP_AL }, + { 0x00F7, 0x00F7, LBP_AI }, + { 0x00F8, 0x02C6, LBP_AL }, + { 0x02C7, 0x02C7, LBP_AI }, + { 0x02C8, 0x02C8, LBP_BB }, + { 0x02C9, 0x02CB, LBP_AI }, + { 0x02CC, 0x02CC, LBP_BB }, + { 0x02CD, 0x02CD, LBP_AI }, + { 0x02CE, 0x02CF, LBP_AL }, + { 0x02D0, 0x02D0, LBP_AI }, + { 0x02D1, 0x02D7, LBP_AL }, + { 0x02D8, 0x02DB, LBP_AI }, + { 0x02DC, 0x02DC, LBP_AL }, + { 0x02DD, 0x02DD, LBP_AI }, + { 0x02DE, 0x02DE, LBP_AL }, + { 0x02DF, 0x02DF, LBP_BB }, + { 0x02E0, 0x02FF, LBP_AL }, + { 0x0300, 0x034E, LBP_CM }, + { 0x034F, 0x034F, LBP_GL }, + { 0x0350, 0x035B, LBP_CM }, + { 0x035C, 0x0362, LBP_GL }, + { 0x0363, 0x036F, LBP_CM }, + { 0x0370, 0x037D, LBP_AL }, + { 0x037E, 0x037E, LBP_IS }, + { 0x0384, 0x0482, LBP_AL }, + { 0x0483, 0x0489, LBP_CM }, + { 0x048A, 0x0587, LBP_AL }, + { 0x0589, 0x0589, LBP_IS }, + { 0x058A, 0x058A, LBP_BA }, + { 0x0591, 0x05BD, LBP_CM }, + { 0x05BE, 0x05BE, LBP_BA }, + { 0x05BF, 0x05BF, LBP_CM }, + { 0x05C0, 0x05C0, LBP_AL }, + { 0x05C1, 0x05C2, LBP_CM }, + { 0x05C3, 0x05C3, LBP_AL }, + { 0x05C4, 0x05C5, LBP_CM }, + { 0x05C6, 0x05C6, LBP_EX }, + { 0x05C7, 0x05C7, LBP_CM }, + { 0x05D0, 0x0608, LBP_AL }, + { 0x0609, 0x060B, LBP_PO }, + { 0x060C, 0x060D, LBP_IS }, + { 0x060E, 0x060F, LBP_AL }, + { 0x0610, 0x061A, LBP_CM }, + { 0x061B, 0x061F, LBP_EX }, + { 0x0620, 0x064A, LBP_AL }, + { 0x064B, 0x065F, LBP_CM }, + { 0x0660, 0x0669, LBP_NU }, + { 0x066A, 0x066A, LBP_PO }, + { 0x066B, 0x066C, LBP_NU }, + { 0x066D, 0x066F, LBP_AL }, + { 0x0670, 0x0670, LBP_CM }, + { 0x0671, 0x06D3, LBP_AL }, + { 0x06D4, 0x06D4, LBP_EX }, + { 0x06D5, 0x06D5, LBP_AL }, + { 0x06D6, 0x06DC, LBP_CM }, + { 0x06DD, 0x06DE, LBP_AL }, + { 0x06DF, 0x06E4, LBP_CM }, + { 0x06E5, 0x06E6, LBP_AL }, + { 0x06E7, 0x06E8, LBP_CM }, + { 0x06E9, 0x06E9, LBP_AL }, + { 0x06EA, 0x06ED, LBP_CM }, + { 0x06EE, 0x06EF, LBP_AL }, + { 0x06F0, 0x06F9, LBP_NU }, + { 0x06FA, 0x0710, LBP_AL }, + { 0x0711, 0x0711, LBP_CM }, + { 0x0712, 0x072F, LBP_AL }, + { 0x0730, 0x074A, LBP_CM }, + { 
0x074D, 0x07A5, LBP_AL }, + { 0x07A6, 0x07B0, LBP_CM }, + { 0x07B1, 0x07B1, LBP_AL }, + { 0x07C0, 0x07C9, LBP_NU }, + { 0x07CA, 0x07EA, LBP_AL }, + { 0x07EB, 0x07F3, LBP_CM }, + { 0x07F4, 0x07F7, LBP_AL }, + { 0x07F8, 0x07F8, LBP_IS }, + { 0x07F9, 0x07F9, LBP_EX }, + { 0x07FA, 0x0815, LBP_AL }, + { 0x0816, 0x0819, LBP_CM }, + { 0x081A, 0x081A, LBP_AL }, + { 0x081B, 0x0823, LBP_CM }, + { 0x0824, 0x0824, LBP_AL }, + { 0x0825, 0x0827, LBP_CM }, + { 0x0828, 0x0828, LBP_AL }, + { 0x0829, 0x082D, LBP_CM }, + { 0x0830, 0x0858, LBP_AL }, + { 0x0859, 0x085B, LBP_CM }, + { 0x085E, 0x085E, LBP_AL }, + { 0x0900, 0x0903, LBP_CM }, + { 0x0904, 0x0939, LBP_AL }, + { 0x093A, 0x093C, LBP_CM }, + { 0x093D, 0x093D, LBP_AL }, + { 0x093E, 0x094F, LBP_CM }, + { 0x0950, 0x0950, LBP_AL }, + { 0x0951, 0x0957, LBP_CM }, + { 0x0958, 0x0961, LBP_AL }, + { 0x0962, 0x0963, LBP_CM }, + { 0x0964, 0x0965, LBP_BA }, + { 0x0966, 0x096F, LBP_NU }, + { 0x0970, 0x097F, LBP_AL }, + { 0x0981, 0x0983, LBP_CM }, + { 0x0985, 0x09B9, LBP_AL }, + { 0x09BC, 0x09BC, LBP_CM }, + { 0x09BD, 0x09BD, LBP_AL }, + { 0x09BE, 0x09CD, LBP_CM }, + { 0x09CE, 0x09CE, LBP_AL }, + { 0x09D7, 0x09D7, LBP_CM }, + { 0x09DC, 0x09E1, LBP_AL }, + { 0x09E2, 0x09E3, LBP_CM }, + { 0x09E6, 0x09EF, LBP_NU }, + { 0x09F0, 0x09F1, LBP_AL }, + { 0x09F2, 0x09F3, LBP_PO }, + { 0x09F4, 0x09F8, LBP_AL }, + { 0x09F9, 0x09F9, LBP_PO }, + { 0x09FA, 0x09FA, LBP_AL }, + { 0x09FB, 0x09FB, LBP_PR }, + { 0x0A01, 0x0A03, LBP_CM }, + { 0x0A05, 0x0A39, LBP_AL }, + { 0x0A3C, 0x0A51, LBP_CM }, + { 0x0A59, 0x0A5E, LBP_AL }, + { 0x0A66, 0x0A6F, LBP_NU }, + { 0x0A70, 0x0A71, LBP_CM }, + { 0x0A72, 0x0A74, LBP_AL }, + { 0x0A75, 0x0A83, LBP_CM }, + { 0x0A85, 0x0AB9, LBP_AL }, + { 0x0ABC, 0x0ABC, LBP_CM }, + { 0x0ABD, 0x0ABD, LBP_AL }, + { 0x0ABE, 0x0ACD, LBP_CM }, + { 0x0AD0, 0x0AE1, LBP_AL }, + { 0x0AE2, 0x0AE3, LBP_CM }, + { 0x0AE6, 0x0AEF, LBP_NU }, + { 0x0AF1, 0x0AF1, LBP_PR }, + { 0x0B01, 0x0B03, LBP_CM }, + { 0x0B05, 0x0B39, LBP_AL }, + { 0x0B3C, 0x0B3C, 
LBP_CM }, + { 0x0B3D, 0x0B3D, LBP_AL }, + { 0x0B3E, 0x0B57, LBP_CM }, + { 0x0B5C, 0x0B61, LBP_AL }, + { 0x0B62, 0x0B63, LBP_CM }, + { 0x0B66, 0x0B6F, LBP_NU }, + { 0x0B70, 0x0B77, LBP_AL }, + { 0x0B82, 0x0B82, LBP_CM }, + { 0x0B83, 0x0BB9, LBP_AL }, + { 0x0BBE, 0x0BCD, LBP_CM }, + { 0x0BD0, 0x0BD0, LBP_AL }, + { 0x0BD7, 0x0BD7, LBP_CM }, + { 0x0BE6, 0x0BEF, LBP_NU }, + { 0x0BF0, 0x0BF8, LBP_AL }, + { 0x0BF9, 0x0BF9, LBP_PR }, + { 0x0BFA, 0x0BFA, LBP_AL }, + { 0x0C01, 0x0C03, LBP_CM }, + { 0x0C05, 0x0C3D, LBP_AL }, + { 0x0C3E, 0x0C56, LBP_CM }, + { 0x0C58, 0x0C61, LBP_AL }, + { 0x0C62, 0x0C63, LBP_CM }, + { 0x0C66, 0x0C6F, LBP_NU }, + { 0x0C78, 0x0C7F, LBP_AL }, + { 0x0C82, 0x0C83, LBP_CM }, + { 0x0C85, 0x0CB9, LBP_AL }, + { 0x0CBC, 0x0CBC, LBP_CM }, + { 0x0CBD, 0x0CBD, LBP_AL }, + { 0x0CBE, 0x0CD6, LBP_CM }, + { 0x0CDE, 0x0CE1, LBP_AL }, + { 0x0CE2, 0x0CE3, LBP_CM }, + { 0x0CE6, 0x0CEF, LBP_NU }, + { 0x0CF1, 0x0CF2, LBP_AL }, + { 0x0D02, 0x0D03, LBP_CM }, + { 0x0D05, 0x0D3D, LBP_AL }, + { 0x0D3E, 0x0D4D, LBP_CM }, + { 0x0D4E, 0x0D4E, LBP_AL }, + { 0x0D57, 0x0D57, LBP_CM }, + { 0x0D60, 0x0D61, LBP_AL }, + { 0x0D62, 0x0D63, LBP_CM }, + { 0x0D66, 0x0D6F, LBP_NU }, + { 0x0D70, 0x0D75, LBP_AL }, + { 0x0D79, 0x0D79, LBP_PO }, + { 0x0D7A, 0x0D7F, LBP_AL }, + { 0x0D82, 0x0D83, LBP_CM }, + { 0x0D85, 0x0DC6, LBP_AL }, + { 0x0DCA, 0x0DF3, LBP_CM }, + { 0x0DF4, 0x0DF4, LBP_AL }, + { 0x0E01, 0x0E3A, LBP_SA }, + { 0x0E3F, 0x0E3F, LBP_PR }, + { 0x0E40, 0x0E4E, LBP_SA }, + { 0x0E4F, 0x0E4F, LBP_AL }, + { 0x0E50, 0x0E59, LBP_NU }, + { 0x0E5A, 0x0E5B, LBP_BA }, + { 0x0E81, 0x0ECD, LBP_SA }, + { 0x0ED0, 0x0ED9, LBP_NU }, + { 0x0EDC, 0x0EDD, LBP_SA }, + { 0x0F00, 0x0F00, LBP_AL }, + { 0x0F01, 0x0F04, LBP_BB }, + { 0x0F05, 0x0F05, LBP_AL }, + { 0x0F06, 0x0F07, LBP_BB }, + { 0x0F08, 0x0F08, LBP_GL }, + { 0x0F09, 0x0F0A, LBP_BB }, + { 0x0F0B, 0x0F0B, LBP_BA }, + { 0x0F0C, 0x0F0C, LBP_GL }, + { 0x0F0D, 0x0F11, LBP_EX }, + { 0x0F12, 0x0F12, LBP_GL }, + { 0x0F13, 0x0F13, LBP_AL }, + { 
0x0F14, 0x0F14, LBP_EX }, + { 0x0F15, 0x0F17, LBP_AL }, + { 0x0F18, 0x0F19, LBP_CM }, + { 0x0F1A, 0x0F1F, LBP_AL }, + { 0x0F20, 0x0F29, LBP_NU }, + { 0x0F2A, 0x0F33, LBP_AL }, + { 0x0F34, 0x0F34, LBP_BA }, + { 0x0F35, 0x0F35, LBP_CM }, + { 0x0F36, 0x0F36, LBP_AL }, + { 0x0F37, 0x0F37, LBP_CM }, + { 0x0F38, 0x0F38, LBP_AL }, + { 0x0F39, 0x0F39, LBP_CM }, + { 0x0F3A, 0x0F3A, LBP_OP }, + { 0x0F3B, 0x0F3B, LBP_CL }, + { 0x0F3C, 0x0F3C, LBP_OP }, + { 0x0F3D, 0x0F3D, LBP_CL }, + { 0x0F3E, 0x0F3F, LBP_CM }, + { 0x0F40, 0x0F6C, LBP_AL }, + { 0x0F71, 0x0F7E, LBP_CM }, + { 0x0F7F, 0x0F7F, LBP_BA }, + { 0x0F80, 0x0F84, LBP_CM }, + { 0x0F85, 0x0F85, LBP_BA }, + { 0x0F86, 0x0F87, LBP_CM }, + { 0x0F88, 0x0F8C, LBP_AL }, + { 0x0F8D, 0x0FBC, LBP_CM }, + { 0x0FBE, 0x0FBF, LBP_BA }, + { 0x0FC0, 0x0FC5, LBP_AL }, + { 0x0FC6, 0x0FC6, LBP_CM }, + { 0x0FC7, 0x0FCF, LBP_AL }, + { 0x0FD0, 0x0FD1, LBP_BB }, + { 0x0FD2, 0x0FD2, LBP_BA }, + { 0x0FD3, 0x0FD3, LBP_BB }, + { 0x0FD4, 0x0FD8, LBP_AL }, + { 0x0FD9, 0x0FDA, LBP_GL }, + { 0x1000, 0x103F, LBP_SA }, + { 0x1040, 0x1049, LBP_NU }, + { 0x104A, 0x104B, LBP_BA }, + { 0x104C, 0x104F, LBP_AL }, + { 0x1050, 0x108F, LBP_SA }, + { 0x1090, 0x1099, LBP_NU }, + { 0x109A, 0x109F, LBP_SA }, + { 0x10A0, 0x10FC, LBP_AL }, + { 0x1100, 0x115F, LBP_JL }, + { 0x1160, 0x11A7, LBP_JV }, + { 0x11A8, 0x11FF, LBP_JT }, + { 0x1200, 0x135A, LBP_AL }, + { 0x135D, 0x135F, LBP_CM }, + { 0x1360, 0x1360, LBP_AL }, + { 0x1361, 0x1361, LBP_BA }, + { 0x1362, 0x13F4, LBP_AL }, + { 0x1400, 0x1400, LBP_BA }, + { 0x1401, 0x167F, LBP_AL }, + { 0x1680, 0x1680, LBP_BA }, + { 0x1681, 0x169A, LBP_AL }, + { 0x169B, 0x169B, LBP_OP }, + { 0x169C, 0x169C, LBP_CL }, + { 0x16A0, 0x16EA, LBP_AL }, + { 0x16EB, 0x16ED, LBP_BA }, + { 0x16EE, 0x1711, LBP_AL }, + { 0x1712, 0x1714, LBP_CM }, + { 0x1720, 0x1731, LBP_AL }, + { 0x1732, 0x1734, LBP_CM }, + { 0x1735, 0x1736, LBP_BA }, + { 0x1740, 0x1751, LBP_AL }, + { 0x1752, 0x1753, LBP_CM }, + { 0x1760, 0x1770, LBP_AL }, + { 0x1772, 0x1773, 
LBP_CM }, + { 0x1780, 0x17D3, LBP_SA }, + { 0x17D4, 0x17D5, LBP_BA }, + { 0x17D6, 0x17D6, LBP_NS }, + { 0x17D7, 0x17D7, LBP_SA }, + { 0x17D8, 0x17D8, LBP_BA }, + { 0x17D9, 0x17D9, LBP_AL }, + { 0x17DA, 0x17DA, LBP_BA }, + { 0x17DB, 0x17DB, LBP_PR }, + { 0x17DC, 0x17DD, LBP_SA }, + { 0x17E0, 0x17E9, LBP_NU }, + { 0x17F0, 0x1801, LBP_AL }, + { 0x1802, 0x1803, LBP_EX }, + { 0x1804, 0x1805, LBP_BA }, + { 0x1806, 0x1806, LBP_BB }, + { 0x1807, 0x1807, LBP_AL }, + { 0x1808, 0x1809, LBP_EX }, + { 0x180A, 0x180A, LBP_AL }, + { 0x180B, 0x180D, LBP_CM }, + { 0x180E, 0x180E, LBP_GL }, + { 0x1810, 0x1819, LBP_NU }, + { 0x1820, 0x18A8, LBP_AL }, + { 0x18A9, 0x18A9, LBP_CM }, + { 0x18AA, 0x191C, LBP_AL }, + { 0x1920, 0x193B, LBP_CM }, + { 0x1940, 0x1940, LBP_AL }, + { 0x1944, 0x1945, LBP_EX }, + { 0x1946, 0x194F, LBP_NU }, + { 0x1950, 0x19C9, LBP_SA }, + { 0x19D0, 0x19D9, LBP_NU }, + { 0x19DA, 0x19DF, LBP_SA }, + { 0x19E0, 0x1A16, LBP_AL }, + { 0x1A17, 0x1A1B, LBP_CM }, + { 0x1A1E, 0x1A1F, LBP_AL }, + { 0x1A20, 0x1A7C, LBP_SA }, + { 0x1A7F, 0x1A7F, LBP_CM }, + { 0x1A80, 0x1A99, LBP_NU }, + { 0x1AA0, 0x1AAD, LBP_SA }, + { 0x1B00, 0x1B04, LBP_CM }, + { 0x1B05, 0x1B33, LBP_AL }, + { 0x1B34, 0x1B44, LBP_CM }, + { 0x1B45, 0x1B4B, LBP_AL }, + { 0x1B50, 0x1B59, LBP_NU }, + { 0x1B5A, 0x1B5B, LBP_BA }, + { 0x1B5C, 0x1B5C, LBP_AL }, + { 0x1B5D, 0x1B60, LBP_BA }, + { 0x1B61, 0x1B6A, LBP_AL }, + { 0x1B6B, 0x1B73, LBP_CM }, + { 0x1B74, 0x1B7C, LBP_AL }, + { 0x1B80, 0x1B82, LBP_CM }, + { 0x1B83, 0x1BA0, LBP_AL }, + { 0x1BA1, 0x1BAA, LBP_CM }, + { 0x1BAE, 0x1BAF, LBP_AL }, + { 0x1BB0, 0x1BB9, LBP_NU }, + { 0x1BC0, 0x1BE5, LBP_AL }, + { 0x1BE6, 0x1BF3, LBP_CM }, + { 0x1BFC, 0x1C23, LBP_AL }, + { 0x1C24, 0x1C37, LBP_CM }, + { 0x1C3B, 0x1C3F, LBP_BA }, + { 0x1C40, 0x1C49, LBP_NU }, + { 0x1C4D, 0x1C4F, LBP_AL }, + { 0x1C50, 0x1C59, LBP_NU }, + { 0x1C5A, 0x1C7D, LBP_AL }, + { 0x1C7E, 0x1C7F, LBP_BA }, + { 0x1CD0, 0x1CD2, LBP_CM }, + { 0x1CD3, 0x1CD3, LBP_AL }, + { 0x1CD4, 0x1CE8, LBP_CM }, + { 
0x1CE9, 0x1CEC, LBP_AL }, + { 0x1CED, 0x1CED, LBP_CM }, + { 0x1CEE, 0x1CF1, LBP_AL }, + { 0x1CF2, 0x1CF2, LBP_CM }, + { 0x1D00, 0x1DBF, LBP_AL }, + { 0x1DC0, 0x1DFF, LBP_CM }, + { 0x1E00, 0x1FFC, LBP_AL }, + { 0x1FFD, 0x1FFD, LBP_BB }, + { 0x1FFE, 0x1FFE, LBP_AL }, + { 0x2000, 0x2006, LBP_BA }, + { 0x2007, 0x2007, LBP_GL }, + { 0x2008, 0x200A, LBP_BA }, + { 0x200B, 0x200B, LBP_ZW }, + { 0x200C, 0x200F, LBP_CM }, + { 0x2010, 0x2010, LBP_BA }, + { 0x2011, 0x2011, LBP_GL }, + { 0x2012, 0x2013, LBP_BA }, + { 0x2014, 0x2014, LBP_B2 }, + { 0x2015, 0x2016, LBP_AI }, + { 0x2017, 0x2017, LBP_AL }, + { 0x2018, 0x2019, LBP_QU }, + { 0x201A, 0x201A, LBP_OP }, + { 0x201B, 0x201D, LBP_QU }, + { 0x201E, 0x201E, LBP_OP }, + { 0x201F, 0x201F, LBP_QU }, + { 0x2020, 0x2021, LBP_AI }, + { 0x2022, 0x2023, LBP_AL }, + { 0x2024, 0x2026, LBP_IN }, + { 0x2027, 0x2027, LBP_BA }, + { 0x2028, 0x2029, LBP_BK }, + { 0x202A, 0x202E, LBP_CM }, + { 0x202F, 0x202F, LBP_GL }, + { 0x2030, 0x2037, LBP_PO }, + { 0x2038, 0x2038, LBP_AL }, + { 0x2039, 0x203A, LBP_QU }, + { 0x203B, 0x203B, LBP_AI }, + { 0x203C, 0x203D, LBP_NS }, + { 0x203E, 0x2043, LBP_AL }, + { 0x2044, 0x2044, LBP_IS }, + { 0x2045, 0x2045, LBP_OP }, + { 0x2046, 0x2046, LBP_CL }, + { 0x2047, 0x2049, LBP_NS }, + { 0x204A, 0x2055, LBP_AL }, + { 0x2056, 0x2056, LBP_BA }, + { 0x2057, 0x2057, LBP_AL }, + { 0x2058, 0x205B, LBP_BA }, + { 0x205C, 0x205C, LBP_AL }, + { 0x205D, 0x205F, LBP_BA }, + { 0x2060, 0x2060, LBP_WJ }, + { 0x2061, 0x2064, LBP_AL }, + { 0x206A, 0x206F, LBP_CM }, + { 0x2070, 0x2071, LBP_AL }, + { 0x2074, 0x2074, LBP_AI }, + { 0x2075, 0x207C, LBP_AL }, + { 0x207D, 0x207D, LBP_OP }, + { 0x207E, 0x207E, LBP_CL }, + { 0x207F, 0x207F, LBP_AI }, + { 0x2080, 0x2080, LBP_AL }, + { 0x2081, 0x2084, LBP_AI }, + { 0x2085, 0x208C, LBP_AL }, + { 0x208D, 0x208D, LBP_OP }, + { 0x208E, 0x208E, LBP_CL }, + { 0x2090, 0x209C, LBP_AL }, + { 0x20A0, 0x20A6, LBP_PR }, + { 0x20A7, 0x20A7, LBP_PO }, + { 0x20A8, 0x20B5, LBP_PR }, + { 0x20B6, 0x20B6, 
LBP_PO }, + { 0x20B7, 0x20B9, LBP_PR }, + { 0x20D0, 0x20F0, LBP_CM }, + { 0x2100, 0x2102, LBP_AL }, + { 0x2103, 0x2103, LBP_PO }, + { 0x2104, 0x2104, LBP_AL }, + { 0x2105, 0x2105, LBP_AI }, + { 0x2106, 0x2108, LBP_AL }, + { 0x2109, 0x2109, LBP_PO }, + { 0x210A, 0x2112, LBP_AL }, + { 0x2113, 0x2113, LBP_AI }, + { 0x2114, 0x2115, LBP_AL }, + { 0x2116, 0x2116, LBP_PR }, + { 0x2117, 0x2120, LBP_AL }, + { 0x2121, 0x2122, LBP_AI }, + { 0x2123, 0x212A, LBP_AL }, + { 0x212B, 0x212B, LBP_AI }, + { 0x212C, 0x2153, LBP_AL }, + { 0x2154, 0x2155, LBP_AI }, + { 0x2156, 0x215A, LBP_AL }, + { 0x215B, 0x215B, LBP_AI }, + { 0x215C, 0x215D, LBP_AL }, + { 0x215E, 0x215E, LBP_AI }, + { 0x215F, 0x215F, LBP_AL }, + { 0x2160, 0x216B, LBP_AI }, + { 0x216C, 0x216F, LBP_AL }, + { 0x2170, 0x2179, LBP_AI }, + { 0x217A, 0x2188, LBP_AL }, + { 0x2189, 0x2199, LBP_AI }, + { 0x219A, 0x21D1, LBP_AL }, + { 0x21D2, 0x21D2, LBP_AI }, + { 0x21D3, 0x21D3, LBP_AL }, + { 0x21D4, 0x21D4, LBP_AI }, + { 0x21D5, 0x21FF, LBP_AL }, + { 0x2200, 0x2200, LBP_AI }, + { 0x2201, 0x2201, LBP_AL }, + { 0x2202, 0x2203, LBP_AI }, + { 0x2204, 0x2206, LBP_AL }, + { 0x2207, 0x2208, LBP_AI }, + { 0x2209, 0x220A, LBP_AL }, + { 0x220B, 0x220B, LBP_AI }, + { 0x220C, 0x220E, LBP_AL }, + { 0x220F, 0x220F, LBP_AI }, + { 0x2210, 0x2210, LBP_AL }, + { 0x2211, 0x2211, LBP_AI }, + { 0x2212, 0x2213, LBP_PR }, + { 0x2214, 0x2214, LBP_AL }, + { 0x2215, 0x2215, LBP_AI }, + { 0x2216, 0x2219, LBP_AL }, + { 0x221A, 0x221A, LBP_AI }, + { 0x221B, 0x221C, LBP_AL }, + { 0x221D, 0x2220, LBP_AI }, + { 0x2221, 0x2222, LBP_AL }, + { 0x2223, 0x2223, LBP_AI }, + { 0x2224, 0x2224, LBP_AL }, + { 0x2225, 0x2225, LBP_AI }, + { 0x2226, 0x2226, LBP_AL }, + { 0x2227, 0x222C, LBP_AI }, + { 0x222D, 0x222D, LBP_AL }, + { 0x222E, 0x222E, LBP_AI }, + { 0x222F, 0x2233, LBP_AL }, + { 0x2234, 0x2237, LBP_AI }, + { 0x2238, 0x223B, LBP_AL }, + { 0x223C, 0x223D, LBP_AI }, + { 0x223E, 0x2247, LBP_AL }, + { 0x2248, 0x2248, LBP_AI }, + { 0x2249, 0x224B, LBP_AL }, + { 
0x224C, 0x224C, LBP_AI }, + { 0x224D, 0x2251, LBP_AL }, + { 0x2252, 0x2252, LBP_AI }, + { 0x2253, 0x225F, LBP_AL }, + { 0x2260, 0x2261, LBP_AI }, + { 0x2262, 0x2263, LBP_AL }, + { 0x2264, 0x2267, LBP_AI }, + { 0x2268, 0x2269, LBP_AL }, + { 0x226A, 0x226B, LBP_AI }, + { 0x226C, 0x226D, LBP_AL }, + { 0x226E, 0x226F, LBP_AI }, + { 0x2270, 0x2281, LBP_AL }, + { 0x2282, 0x2283, LBP_AI }, + { 0x2284, 0x2285, LBP_AL }, + { 0x2286, 0x2287, LBP_AI }, + { 0x2288, 0x2294, LBP_AL }, + { 0x2295, 0x2295, LBP_AI }, + { 0x2296, 0x2298, LBP_AL }, + { 0x2299, 0x2299, LBP_AI }, + { 0x229A, 0x22A4, LBP_AL }, + { 0x22A5, 0x22A5, LBP_AI }, + { 0x22A6, 0x22BE, LBP_AL }, + { 0x22BF, 0x22BF, LBP_AI }, + { 0x22C0, 0x2311, LBP_AL }, + { 0x2312, 0x2312, LBP_AI }, + { 0x2313, 0x2328, LBP_AL }, + { 0x2329, 0x2329, LBP_OP }, + { 0x232A, 0x232A, LBP_CL }, + { 0x232B, 0x244A, LBP_AL }, + { 0x2460, 0x24FE, LBP_AI }, + { 0x24FF, 0x24FF, LBP_AL }, + { 0x2500, 0x254B, LBP_AI }, + { 0x254C, 0x254F, LBP_AL }, + { 0x2550, 0x2574, LBP_AI }, + { 0x2575, 0x257F, LBP_AL }, + { 0x2580, 0x258F, LBP_AI }, + { 0x2590, 0x2591, LBP_AL }, + { 0x2592, 0x2595, LBP_AI }, + { 0x2596, 0x259F, LBP_AL }, + { 0x25A0, 0x25A1, LBP_AI }, + { 0x25A2, 0x25A2, LBP_AL }, + { 0x25A3, 0x25A9, LBP_AI }, + { 0x25AA, 0x25B1, LBP_AL }, + { 0x25B2, 0x25B3, LBP_AI }, + { 0x25B4, 0x25B5, LBP_AL }, + { 0x25B6, 0x25B7, LBP_AI }, + { 0x25B8, 0x25BB, LBP_AL }, + { 0x25BC, 0x25BD, LBP_AI }, + { 0x25BE, 0x25BF, LBP_AL }, + { 0x25C0, 0x25C1, LBP_AI }, + { 0x25C2, 0x25C5, LBP_AL }, + { 0x25C6, 0x25C8, LBP_AI }, + { 0x25C9, 0x25CA, LBP_AL }, + { 0x25CB, 0x25CB, LBP_AI }, + { 0x25CC, 0x25CD, LBP_AL }, + { 0x25CE, 0x25D1, LBP_AI }, + { 0x25D2, 0x25E1, LBP_AL }, + { 0x25E2, 0x25E5, LBP_AI }, + { 0x25E6, 0x25EE, LBP_AL }, + { 0x25EF, 0x25EF, LBP_AI }, + { 0x25F0, 0x2604, LBP_AL }, + { 0x2605, 0x2606, LBP_AI }, + { 0x2607, 0x2608, LBP_AL }, + { 0x2609, 0x2609, LBP_AI }, + { 0x260A, 0x260D, LBP_AL }, + { 0x260E, 0x260F, LBP_AI }, + { 0x2610, 0x2613, 
LBP_AL }, + { 0x2614, 0x2617, LBP_AI }, + { 0x2618, 0x261B, LBP_AL }, + { 0x261C, 0x261C, LBP_AI }, + { 0x261D, 0x261D, LBP_AL }, + { 0x261E, 0x261E, LBP_AI }, + { 0x261F, 0x263F, LBP_AL }, + { 0x2640, 0x2640, LBP_AI }, + { 0x2641, 0x2641, LBP_AL }, + { 0x2642, 0x2642, LBP_AI }, + { 0x2643, 0x265F, LBP_AL }, + { 0x2660, 0x2661, LBP_AI }, + { 0x2662, 0x2662, LBP_AL }, + { 0x2663, 0x2665, LBP_AI }, + { 0x2666, 0x2666, LBP_AL }, + { 0x2667, 0x266A, LBP_AI }, + { 0x266B, 0x266B, LBP_AL }, + { 0x266C, 0x266D, LBP_AI }, + { 0x266E, 0x266E, LBP_AL }, + { 0x266F, 0x266F, LBP_AI }, + { 0x2670, 0x269D, LBP_AL }, + { 0x269E, 0x269F, LBP_AI }, + { 0x26A0, 0x26BD, LBP_AL }, + { 0x26BE, 0x26BF, LBP_AI }, + { 0x26C0, 0x26C3, LBP_AL }, + { 0x26C4, 0x26CD, LBP_AI }, + { 0x26CE, 0x26CE, LBP_AL }, + { 0x26CF, 0x26E1, LBP_AI }, + { 0x26E2, 0x26E2, LBP_AL }, + { 0x26E3, 0x26E3, LBP_AI }, + { 0x26E4, 0x26E7, LBP_AL }, + { 0x26E8, 0x26FF, LBP_AI }, + { 0x2701, 0x2756, LBP_AL }, + { 0x2757, 0x2757, LBP_AI }, + { 0x2758, 0x275A, LBP_AL }, + { 0x275B, 0x275E, LBP_QU }, + { 0x275F, 0x2761, LBP_AL }, + { 0x2762, 0x2763, LBP_EX }, + { 0x2764, 0x2767, LBP_AL }, + { 0x2768, 0x2768, LBP_OP }, + { 0x2769, 0x2769, LBP_CL }, + { 0x276A, 0x276A, LBP_OP }, + { 0x276B, 0x276B, LBP_CL }, + { 0x276C, 0x276C, LBP_OP }, + { 0x276D, 0x276D, LBP_CL }, + { 0x276E, 0x276E, LBP_OP }, + { 0x276F, 0x276F, LBP_CL }, + { 0x2770, 0x2770, LBP_OP }, + { 0x2771, 0x2771, LBP_CL }, + { 0x2772, 0x2772, LBP_OP }, + { 0x2773, 0x2773, LBP_CL }, + { 0x2774, 0x2774, LBP_OP }, + { 0x2775, 0x2775, LBP_CL }, + { 0x2776, 0x2793, LBP_AI }, + { 0x2794, 0x27C4, LBP_AL }, + { 0x27C5, 0x27C5, LBP_OP }, + { 0x27C6, 0x27C6, LBP_CL }, + { 0x27C7, 0x27E5, LBP_AL }, + { 0x27E6, 0x27E6, LBP_OP }, + { 0x27E7, 0x27E7, LBP_CL }, + { 0x27E8, 0x27E8, LBP_OP }, + { 0x27E9, 0x27E9, LBP_CL }, + { 0x27EA, 0x27EA, LBP_OP }, + { 0x27EB, 0x27EB, LBP_CL }, + { 0x27EC, 0x27EC, LBP_OP }, + { 0x27ED, 0x27ED, LBP_CL }, + { 0x27EE, 0x27EE, LBP_OP }, + { 
0x27EF, 0x27EF, LBP_CL }, + { 0x27F0, 0x2982, LBP_AL }, + { 0x2983, 0x2983, LBP_OP }, + { 0x2984, 0x2984, LBP_CL }, + { 0x2985, 0x2985, LBP_OP }, + { 0x2986, 0x2986, LBP_CL }, + { 0x2987, 0x2987, LBP_OP }, + { 0x2988, 0x2988, LBP_CL }, + { 0x2989, 0x2989, LBP_OP }, + { 0x298A, 0x298A, LBP_CL }, + { 0x298B, 0x298B, LBP_OP }, + { 0x298C, 0x298C, LBP_CL }, + { 0x298D, 0x298D, LBP_OP }, + { 0x298E, 0x298E, LBP_CL }, + { 0x298F, 0x298F, LBP_OP }, + { 0x2990, 0x2990, LBP_CL }, + { 0x2991, 0x2991, LBP_OP }, + { 0x2992, 0x2992, LBP_CL }, + { 0x2993, 0x2993, LBP_OP }, + { 0x2994, 0x2994, LBP_CL }, + { 0x2995, 0x2995, LBP_OP }, + { 0x2996, 0x2996, LBP_CL }, + { 0x2997, 0x2997, LBP_OP }, + { 0x2998, 0x2998, LBP_CL }, + { 0x2999, 0x29D7, LBP_AL }, + { 0x29D8, 0x29D8, LBP_OP }, + { 0x29D9, 0x29D9, LBP_CL }, + { 0x29DA, 0x29DA, LBP_OP }, + { 0x29DB, 0x29DB, LBP_CL }, + { 0x29DC, 0x29FB, LBP_AL }, + { 0x29FC, 0x29FC, LBP_OP }, + { 0x29FD, 0x29FD, LBP_CL }, + { 0x29FE, 0x2B54, LBP_AL }, + { 0x2B55, 0x2B59, LBP_AI }, + { 0x2C00, 0x2CEE, LBP_AL }, + { 0x2CEF, 0x2CF1, LBP_CM }, + { 0x2CF9, 0x2CF9, LBP_EX }, + { 0x2CFA, 0x2CFC, LBP_BA }, + { 0x2CFD, 0x2CFD, LBP_AL }, + { 0x2CFE, 0x2CFE, LBP_EX }, + { 0x2CFF, 0x2CFF, LBP_BA }, + { 0x2D00, 0x2D6F, LBP_AL }, + { 0x2D70, 0x2D70, LBP_BA }, + { 0x2D7F, 0x2D7F, LBP_CM }, + { 0x2D80, 0x2DDE, LBP_AL }, + { 0x2DE0, 0x2DFF, LBP_CM }, + { 0x2E00, 0x2E0D, LBP_QU }, + { 0x2E0E, 0x2E15, LBP_BA }, + { 0x2E16, 0x2E16, LBP_AL }, + { 0x2E17, 0x2E17, LBP_BA }, + { 0x2E18, 0x2E18, LBP_OP }, + { 0x2E19, 0x2E19, LBP_BA }, + { 0x2E1A, 0x2E1B, LBP_AL }, + { 0x2E1C, 0x2E1D, LBP_QU }, + { 0x2E1E, 0x2E1F, LBP_AL }, + { 0x2E20, 0x2E21, LBP_QU }, + { 0x2E22, 0x2E22, LBP_OP }, + { 0x2E23, 0x2E23, LBP_CL }, + { 0x2E24, 0x2E24, LBP_OP }, + { 0x2E25, 0x2E25, LBP_CL }, + { 0x2E26, 0x2E26, LBP_OP }, + { 0x2E27, 0x2E27, LBP_CL }, + { 0x2E28, 0x2E28, LBP_OP }, + { 0x2E29, 0x2E29, LBP_CL }, + { 0x2E2A, 0x2E2D, LBP_BA }, + { 0x2E2E, 0x2E2E, LBP_EX }, + { 0x2E2F, 0x2E2F, 
LBP_AL }, + { 0x2E30, 0x2E31, LBP_BA }, + { 0x2E80, 0x3000, LBP_ID }, + { 0x3001, 0x3002, LBP_CL }, + { 0x3003, 0x3004, LBP_ID }, + { 0x3005, 0x3005, LBP_NS }, + { 0x3006, 0x3007, LBP_ID }, + { 0x3008, 0x3008, LBP_OP }, + { 0x3009, 0x3009, LBP_CL }, + { 0x300A, 0x300A, LBP_OP }, + { 0x300B, 0x300B, LBP_CL }, + { 0x300C, 0x300C, LBP_OP }, + { 0x300D, 0x300D, LBP_CL }, + { 0x300E, 0x300E, LBP_OP }, + { 0x300F, 0x300F, LBP_CL }, + { 0x3010, 0x3010, LBP_OP }, + { 0x3011, 0x3011, LBP_CL }, + { 0x3012, 0x3013, LBP_ID }, + { 0x3014, 0x3014, LBP_OP }, + { 0x3015, 0x3015, LBP_CL }, + { 0x3016, 0x3016, LBP_OP }, + { 0x3017, 0x3017, LBP_CL }, + { 0x3018, 0x3018, LBP_OP }, + { 0x3019, 0x3019, LBP_CL }, + { 0x301A, 0x301A, LBP_OP }, + { 0x301B, 0x301B, LBP_CL }, + { 0x301C, 0x301C, LBP_NS }, + { 0x301D, 0x301D, LBP_OP }, + { 0x301E, 0x301F, LBP_CL }, + { 0x3020, 0x3029, LBP_ID }, + { 0x302A, 0x302F, LBP_CM }, + { 0x3030, 0x303A, LBP_ID }, + { 0x303B, 0x303C, LBP_NS }, + { 0x303D, 0x303F, LBP_ID }, + { 0x3041, 0x3041, LBP_NS }, + { 0x3042, 0x3042, LBP_ID }, + { 0x3043, 0x3043, LBP_NS }, + { 0x3044, 0x3044, LBP_ID }, + { 0x3045, 0x3045, LBP_NS }, + { 0x3046, 0x3046, LBP_ID }, + { 0x3047, 0x3047, LBP_NS }, + { 0x3048, 0x3048, LBP_ID }, + { 0x3049, 0x3049, LBP_NS }, + { 0x304A, 0x3062, LBP_ID }, + { 0x3063, 0x3063, LBP_NS }, + { 0x3064, 0x3082, LBP_ID }, + { 0x3083, 0x3083, LBP_NS }, + { 0x3084, 0x3084, LBP_ID }, + { 0x3085, 0x3085, LBP_NS }, + { 0x3086, 0x3086, LBP_ID }, + { 0x3087, 0x3087, LBP_NS }, + { 0x3088, 0x308D, LBP_ID }, + { 0x308E, 0x308E, LBP_NS }, + { 0x308F, 0x3094, LBP_ID }, + { 0x3095, 0x3096, LBP_NS }, + { 0x3099, 0x309A, LBP_CM }, + { 0x309B, 0x309E, LBP_NS }, + { 0x309F, 0x309F, LBP_ID }, + { 0x30A0, 0x30A1, LBP_NS }, + { 0x30A2, 0x30A2, LBP_ID }, + { 0x30A3, 0x30A3, LBP_NS }, + { 0x30A4, 0x30A4, LBP_ID }, + { 0x30A5, 0x30A5, LBP_NS }, + { 0x30A6, 0x30A6, LBP_ID }, + { 0x30A7, 0x30A7, LBP_NS }, + { 0x30A8, 0x30A8, LBP_ID }, + { 0x30A9, 0x30A9, LBP_NS }, + { 
0x30AA, 0x30C2, LBP_ID }, + { 0x30C3, 0x30C3, LBP_NS }, + { 0x30C4, 0x30E2, LBP_ID }, + { 0x30E3, 0x30E3, LBP_NS }, + { 0x30E4, 0x30E4, LBP_ID }, + { 0x30E5, 0x30E5, LBP_NS }, + { 0x30E6, 0x30E6, LBP_ID }, + { 0x30E7, 0x30E7, LBP_NS }, + { 0x30E8, 0x30ED, LBP_ID }, + { 0x30EE, 0x30EE, LBP_NS }, + { 0x30EF, 0x30F4, LBP_ID }, + { 0x30F5, 0x30F6, LBP_NS }, + { 0x30F7, 0x30FA, LBP_ID }, + { 0x30FB, 0x30FE, LBP_NS }, + { 0x30FF, 0x31E3, LBP_ID }, + { 0x31F0, 0x31FF, LBP_NS }, + { 0x3200, 0x3247, LBP_ID }, + { 0x3248, 0x324F, LBP_AI }, + { 0x3250, 0x4DBF, LBP_ID }, + { 0x4DC0, 0x4DFF, LBP_AL }, + { 0x4E00, 0xA014, LBP_ID }, + { 0xA015, 0xA015, LBP_NS }, + { 0xA016, 0xA4C6, LBP_ID }, + { 0xA4D0, 0xA4FD, LBP_AL }, + { 0xA4FE, 0xA4FF, LBP_BA }, + { 0xA500, 0xA60C, LBP_AL }, + { 0xA60D, 0xA60D, LBP_BA }, + { 0xA60E, 0xA60E, LBP_EX }, + { 0xA60F, 0xA60F, LBP_BA }, + { 0xA610, 0xA61F, LBP_AL }, + { 0xA620, 0xA629, LBP_NU }, + { 0xA62A, 0xA66E, LBP_AL }, + { 0xA66F, 0xA672, LBP_CM }, + { 0xA673, 0xA673, LBP_AL }, + { 0xA67C, 0xA67D, LBP_CM }, + { 0xA67E, 0xA6EF, LBP_AL }, + { 0xA6F0, 0xA6F1, LBP_CM }, + { 0xA6F2, 0xA6F2, LBP_AL }, + { 0xA6F3, 0xA6F7, LBP_BA }, + { 0xA700, 0xA801, LBP_AL }, + { 0xA802, 0xA802, LBP_CM }, + { 0xA803, 0xA805, LBP_AL }, + { 0xA806, 0xA806, LBP_CM }, + { 0xA807, 0xA80A, LBP_AL }, + { 0xA80B, 0xA80B, LBP_CM }, + { 0xA80C, 0xA822, LBP_AL }, + { 0xA823, 0xA827, LBP_CM }, + { 0xA828, 0xA837, LBP_AL }, + { 0xA838, 0xA838, LBP_PO }, + { 0xA839, 0xA873, LBP_AL }, + { 0xA874, 0xA875, LBP_BB }, + { 0xA876, 0xA877, LBP_EX }, + { 0xA880, 0xA881, LBP_CM }, + { 0xA882, 0xA8B3, LBP_AL }, + { 0xA8B4, 0xA8C4, LBP_CM }, + { 0xA8CE, 0xA8CF, LBP_BA }, + { 0xA8D0, 0xA8D9, LBP_NU }, + { 0xA8E0, 0xA8F1, LBP_CM }, + { 0xA8F2, 0xA8FB, LBP_AL }, + { 0xA900, 0xA909, LBP_NU }, + { 0xA90A, 0xA925, LBP_AL }, + { 0xA926, 0xA92D, LBP_CM }, + { 0xA92E, 0xA92F, LBP_BA }, + { 0xA930, 0xA946, LBP_AL }, + { 0xA947, 0xA953, LBP_CM }, + { 0xA95F, 0xA95F, LBP_AL }, + { 0xA960, 0xA97C, 
LBP_JL }, + { 0xA980, 0xA983, LBP_CM }, + { 0xA984, 0xA9B2, LBP_AL }, + { 0xA9B3, 0xA9C0, LBP_CM }, + { 0xA9C1, 0xA9C6, LBP_AL }, + { 0xA9C7, 0xA9C9, LBP_BA }, + { 0xA9CA, 0xA9CF, LBP_AL }, + { 0xA9D0, 0xA9D9, LBP_NU }, + { 0xA9DE, 0xAA28, LBP_AL }, + { 0xAA29, 0xAA36, LBP_CM }, + { 0xAA40, 0xAA42, LBP_AL }, + { 0xAA43, 0xAA43, LBP_CM }, + { 0xAA44, 0xAA4B, LBP_AL }, + { 0xAA4C, 0xAA4D, LBP_CM }, + { 0xAA50, 0xAA59, LBP_NU }, + { 0xAA5C, 0xAA5C, LBP_AL }, + { 0xAA5D, 0xAA5F, LBP_BA }, + { 0xAA60, 0xAADF, LBP_SA }, + { 0xAB01, 0xABE2, LBP_AL }, + { 0xABE3, 0xABEA, LBP_CM }, + { 0xABEB, 0xABEB, LBP_BA }, + { 0xABEC, 0xABED, LBP_CM }, + { 0xABF0, 0xABF9, LBP_NU }, + { 0xAC00, 0xAC00, LBP_H2 }, + { 0xAC01, 0xAC1B, LBP_H3 }, + { 0xAC1C, 0xAC1C, LBP_H2 }, + { 0xAC1D, 0xAC37, LBP_H3 }, + { 0xAC38, 0xAC38, LBP_H2 }, + { 0xAC39, 0xAC53, LBP_H3 }, + { 0xAC54, 0xAC54, LBP_H2 }, + { 0xAC55, 0xAC6F, LBP_H3 }, + { 0xAC70, 0xAC70, LBP_H2 }, + { 0xAC71, 0xAC8B, LBP_H3 }, + { 0xAC8C, 0xAC8C, LBP_H2 }, + { 0xAC8D, 0xACA7, LBP_H3 }, + { 0xACA8, 0xACA8, LBP_H2 }, + { 0xACA9, 0xACC3, LBP_H3 }, + { 0xACC4, 0xACC4, LBP_H2 }, + { 0xACC5, 0xACDF, LBP_H3 }, + { 0xACE0, 0xACE0, LBP_H2 }, + { 0xACE1, 0xACFB, LBP_H3 }, + { 0xACFC, 0xACFC, LBP_H2 }, + { 0xACFD, 0xAD17, LBP_H3 }, + { 0xAD18, 0xAD18, LBP_H2 }, + { 0xAD19, 0xAD33, LBP_H3 }, + { 0xAD34, 0xAD34, LBP_H2 }, + { 0xAD35, 0xAD4F, LBP_H3 }, + { 0xAD50, 0xAD50, LBP_H2 }, + { 0xAD51, 0xAD6B, LBP_H3 }, + { 0xAD6C, 0xAD6C, LBP_H2 }, + { 0xAD6D, 0xAD87, LBP_H3 }, + { 0xAD88, 0xAD88, LBP_H2 }, + { 0xAD89, 0xADA3, LBP_H3 }, + { 0xADA4, 0xADA4, LBP_H2 }, + { 0xADA5, 0xADBF, LBP_H3 }, + { 0xADC0, 0xADC0, LBP_H2 }, + { 0xADC1, 0xADDB, LBP_H3 }, + { 0xADDC, 0xADDC, LBP_H2 }, + { 0xADDD, 0xADF7, LBP_H3 }, + { 0xADF8, 0xADF8, LBP_H2 }, + { 0xADF9, 0xAE13, LBP_H3 }, + { 0xAE14, 0xAE14, LBP_H2 }, + { 0xAE15, 0xAE2F, LBP_H3 }, + { 0xAE30, 0xAE30, LBP_H2 }, + { 0xAE31, 0xAE4B, LBP_H3 }, + { 0xAE4C, 0xAE4C, LBP_H2 }, + { 0xAE4D, 0xAE67, LBP_H3 }, + { 
0xAE68, 0xAE68, LBP_H2 }, + { 0xAE69, 0xAE83, LBP_H3 }, + { 0xAE84, 0xAE84, LBP_H2 }, + { 0xAE85, 0xAE9F, LBP_H3 }, + { 0xAEA0, 0xAEA0, LBP_H2 }, + { 0xAEA1, 0xAEBB, LBP_H3 }, + { 0xAEBC, 0xAEBC, LBP_H2 }, + { 0xAEBD, 0xAED7, LBP_H3 }, + { 0xAED8, 0xAED8, LBP_H2 }, + { 0xAED9, 0xAEF3, LBP_H3 }, + { 0xAEF4, 0xAEF4, LBP_H2 }, + { 0xAEF5, 0xAF0F, LBP_H3 }, + { 0xAF10, 0xAF10, LBP_H2 }, + { 0xAF11, 0xAF2B, LBP_H3 }, + { 0xAF2C, 0xAF2C, LBP_H2 }, + { 0xAF2D, 0xAF47, LBP_H3 }, + { 0xAF48, 0xAF48, LBP_H2 }, + { 0xAF49, 0xAF63, LBP_H3 }, + { 0xAF64, 0xAF64, LBP_H2 }, + { 0xAF65, 0xAF7F, LBP_H3 }, + { 0xAF80, 0xAF80, LBP_H2 }, + { 0xAF81, 0xAF9B, LBP_H3 }, + { 0xAF9C, 0xAF9C, LBP_H2 }, + { 0xAF9D, 0xAFB7, LBP_H3 }, + { 0xAFB8, 0xAFB8, LBP_H2 }, + { 0xAFB9, 0xAFD3, LBP_H3 }, + { 0xAFD4, 0xAFD4, LBP_H2 }, + { 0xAFD5, 0xAFEF, LBP_H3 }, + { 0xAFF0, 0xAFF0, LBP_H2 }, + { 0xAFF1, 0xB00B, LBP_H3 }, + { 0xB00C, 0xB00C, LBP_H2 }, + { 0xB00D, 0xB027, LBP_H3 }, + { 0xB028, 0xB028, LBP_H2 }, + { 0xB029, 0xB043, LBP_H3 }, + { 0xB044, 0xB044, LBP_H2 }, + { 0xB045, 0xB05F, LBP_H3 }, + { 0xB060, 0xB060, LBP_H2 }, + { 0xB061, 0xB07B, LBP_H3 }, + { 0xB07C, 0xB07C, LBP_H2 }, + { 0xB07D, 0xB097, LBP_H3 }, + { 0xB098, 0xB098, LBP_H2 }, + { 0xB099, 0xB0B3, LBP_H3 }, + { 0xB0B4, 0xB0B4, LBP_H2 }, + { 0xB0B5, 0xB0CF, LBP_H3 }, + { 0xB0D0, 0xB0D0, LBP_H2 }, + { 0xB0D1, 0xB0EB, LBP_H3 }, + { 0xB0EC, 0xB0EC, LBP_H2 }, + { 0xB0ED, 0xB107, LBP_H3 }, + { 0xB108, 0xB108, LBP_H2 }, + { 0xB109, 0xB123, LBP_H3 }, + { 0xB124, 0xB124, LBP_H2 }, + { 0xB125, 0xB13F, LBP_H3 }, + { 0xB140, 0xB140, LBP_H2 }, + { 0xB141, 0xB15B, LBP_H3 }, + { 0xB15C, 0xB15C, LBP_H2 }, + { 0xB15D, 0xB177, LBP_H3 }, + { 0xB178, 0xB178, LBP_H2 }, + { 0xB179, 0xB193, LBP_H3 }, + { 0xB194, 0xB194, LBP_H2 }, + { 0xB195, 0xB1AF, LBP_H3 }, + { 0xB1B0, 0xB1B0, LBP_H2 }, + { 0xB1B1, 0xB1CB, LBP_H3 }, + { 0xB1CC, 0xB1CC, LBP_H2 }, + { 0xB1CD, 0xB1E7, LBP_H3 }, + { 0xB1E8, 0xB1E8, LBP_H2 }, + { 0xB1E9, 0xB203, LBP_H3 }, + { 0xB204, 0xB204, 
LBP_H2 }, + { 0xB205, 0xB21F, LBP_H3 }, + { 0xB220, 0xB220, LBP_H2 }, + { 0xB221, 0xB23B, LBP_H3 }, + { 0xB23C, 0xB23C, LBP_H2 }, + { 0xB23D, 0xB257, LBP_H3 }, + { 0xB258, 0xB258, LBP_H2 }, + { 0xB259, 0xB273, LBP_H3 }, + { 0xB274, 0xB274, LBP_H2 }, + { 0xB275, 0xB28F, LBP_H3 }, + { 0xB290, 0xB290, LBP_H2 }, + { 0xB291, 0xB2AB, LBP_H3 }, + { 0xB2AC, 0xB2AC, LBP_H2 }, + { 0xB2AD, 0xB2C7, LBP_H3 }, + { 0xB2C8, 0xB2C8, LBP_H2 }, + { 0xB2C9, 0xB2E3, LBP_H3 }, + { 0xB2E4, 0xB2E4, LBP_H2 }, + { 0xB2E5, 0xB2FF, LBP_H3 }, + { 0xB300, 0xB300, LBP_H2 }, + { 0xB301, 0xB31B, LBP_H3 }, + { 0xB31C, 0xB31C, LBP_H2 }, + { 0xB31D, 0xB337, LBP_H3 }, + { 0xB338, 0xB338, LBP_H2 }, + { 0xB339, 0xB353, LBP_H3 }, + { 0xB354, 0xB354, LBP_H2 }, + { 0xB355, 0xB36F, LBP_H3 }, + { 0xB370, 0xB370, LBP_H2 }, + { 0xB371, 0xB38B, LBP_H3 }, + { 0xB38C, 0xB38C, LBP_H2 }, + { 0xB38D, 0xB3A7, LBP_H3 }, + { 0xB3A8, 0xB3A8, LBP_H2 }, + { 0xB3A9, 0xB3C3, LBP_H3 }, + { 0xB3C4, 0xB3C4, LBP_H2 }, + { 0xB3C5, 0xB3DF, LBP_H3 }, + { 0xB3E0, 0xB3E0, LBP_H2 }, + { 0xB3E1, 0xB3FB, LBP_H3 }, + { 0xB3FC, 0xB3FC, LBP_H2 }, + { 0xB3FD, 0xB417, LBP_H3 }, + { 0xB418, 0xB418, LBP_H2 }, + { 0xB419, 0xB433, LBP_H3 }, + { 0xB434, 0xB434, LBP_H2 }, + { 0xB435, 0xB44F, LBP_H3 }, + { 0xB450, 0xB450, LBP_H2 }, + { 0xB451, 0xB46B, LBP_H3 }, + { 0xB46C, 0xB46C, LBP_H2 }, + { 0xB46D, 0xB487, LBP_H3 }, + { 0xB488, 0xB488, LBP_H2 }, + { 0xB489, 0xB4A3, LBP_H3 }, + { 0xB4A4, 0xB4A4, LBP_H2 }, + { 0xB4A5, 0xB4BF, LBP_H3 }, + { 0xB4C0, 0xB4C0, LBP_H2 }, + { 0xB4C1, 0xB4DB, LBP_H3 }, + { 0xB4DC, 0xB4DC, LBP_H2 }, + { 0xB4DD, 0xB4F7, LBP_H3 }, + { 0xB4F8, 0xB4F8, LBP_H2 }, + { 0xB4F9, 0xB513, LBP_H3 }, + { 0xB514, 0xB514, LBP_H2 }, + { 0xB515, 0xB52F, LBP_H3 }, + { 0xB530, 0xB530, LBP_H2 }, + { 0xB531, 0xB54B, LBP_H3 }, + { 0xB54C, 0xB54C, LBP_H2 }, + { 0xB54D, 0xB567, LBP_H3 }, + { 0xB568, 0xB568, LBP_H2 }, + { 0xB569, 0xB583, LBP_H3 }, + { 0xB584, 0xB584, LBP_H2 }, + { 0xB585, 0xB59F, LBP_H3 }, + { 0xB5A0, 0xB5A0, LBP_H2 }, + { 
0xB5A1, 0xB5BB, LBP_H3 }, + { 0xB5BC, 0xB5BC, LBP_H2 }, + { 0xB5BD, 0xB5D7, LBP_H3 }, + { 0xB5D8, 0xB5D8, LBP_H2 }, + { 0xB5D9, 0xB5F3, LBP_H3 }, + { 0xB5F4, 0xB5F4, LBP_H2 }, + { 0xB5F5, 0xB60F, LBP_H3 }, + { 0xB610, 0xB610, LBP_H2 }, + { 0xB611, 0xB62B, LBP_H3 }, + { 0xB62C, 0xB62C, LBP_H2 }, + { 0xB62D, 0xB647, LBP_H3 }, + { 0xB648, 0xB648, LBP_H2 }, + { 0xB649, 0xB663, LBP_H3 }, + { 0xB664, 0xB664, LBP_H2 }, + { 0xB665, 0xB67F, LBP_H3 }, + { 0xB680, 0xB680, LBP_H2 }, + { 0xB681, 0xB69B, LBP_H3 }, + { 0xB69C, 0xB69C, LBP_H2 }, + { 0xB69D, 0xB6B7, LBP_H3 }, + { 0xB6B8, 0xB6B8, LBP_H2 }, + { 0xB6B9, 0xB6D3, LBP_H3 }, + { 0xB6D4, 0xB6D4, LBP_H2 }, + { 0xB6D5, 0xB6EF, LBP_H3 }, + { 0xB6F0, 0xB6F0, LBP_H2 }, + { 0xB6F1, 0xB70B, LBP_H3 }, + { 0xB70C, 0xB70C, LBP_H2 }, + { 0xB70D, 0xB727, LBP_H3 }, + { 0xB728, 0xB728, LBP_H2 }, + { 0xB729, 0xB743, LBP_H3 }, + { 0xB744, 0xB744, LBP_H2 }, + { 0xB745, 0xB75F, LBP_H3 }, + { 0xB760, 0xB760, LBP_H2 }, + { 0xB761, 0xB77B, LBP_H3 }, + { 0xB77C, 0xB77C, LBP_H2 }, + { 0xB77D, 0xB797, LBP_H3 }, + { 0xB798, 0xB798, LBP_H2 }, + { 0xB799, 0xB7B3, LBP_H3 }, + { 0xB7B4, 0xB7B4, LBP_H2 }, + { 0xB7B5, 0xB7CF, LBP_H3 }, + { 0xB7D0, 0xB7D0, LBP_H2 }, + { 0xB7D1, 0xB7EB, LBP_H3 }, + { 0xB7EC, 0xB7EC, LBP_H2 }, + { 0xB7ED, 0xB807, LBP_H3 }, + { 0xB808, 0xB808, LBP_H2 }, + { 0xB809, 0xB823, LBP_H3 }, + { 0xB824, 0xB824, LBP_H2 }, + { 0xB825, 0xB83F, LBP_H3 }, + { 0xB840, 0xB840, LBP_H2 }, + { 0xB841, 0xB85B, LBP_H3 }, + { 0xB85C, 0xB85C, LBP_H2 }, + { 0xB85D, 0xB877, LBP_H3 }, + { 0xB878, 0xB878, LBP_H2 }, + { 0xB879, 0xB893, LBP_H3 }, + { 0xB894, 0xB894, LBP_H2 }, + { 0xB895, 0xB8AF, LBP_H3 }, + { 0xB8B0, 0xB8B0, LBP_H2 }, + { 0xB8B1, 0xB8CB, LBP_H3 }, + { 0xB8CC, 0xB8CC, LBP_H2 }, + { 0xB8CD, 0xB8E7, LBP_H3 }, + { 0xB8E8, 0xB8E8, LBP_H2 }, + { 0xB8E9, 0xB903, LBP_H3 }, + { 0xB904, 0xB904, LBP_H2 }, + { 0xB905, 0xB91F, LBP_H3 }, + { 0xB920, 0xB920, LBP_H2 }, + { 0xB921, 0xB93B, LBP_H3 }, + { 0xB93C, 0xB93C, LBP_H2 }, + { 0xB93D, 0xB957, 
LBP_H3 }, + { 0xB958, 0xB958, LBP_H2 }, + { 0xB959, 0xB973, LBP_H3 }, + { 0xB974, 0xB974, LBP_H2 }, + { 0xB975, 0xB98F, LBP_H3 }, + { 0xB990, 0xB990, LBP_H2 }, + { 0xB991, 0xB9AB, LBP_H3 }, + { 0xB9AC, 0xB9AC, LBP_H2 }, + { 0xB9AD, 0xB9C7, LBP_H3 }, + { 0xB9C8, 0xB9C8, LBP_H2 }, + { 0xB9C9, 0xB9E3, LBP_H3 }, + { 0xB9E4, 0xB9E4, LBP_H2 }, + { 0xB9E5, 0xB9FF, LBP_H3 }, + { 0xBA00, 0xBA00, LBP_H2 }, + { 0xBA01, 0xBA1B, LBP_H3 }, + { 0xBA1C, 0xBA1C, LBP_H2 }, + { 0xBA1D, 0xBA37, LBP_H3 }, + { 0xBA38, 0xBA38, LBP_H2 }, + { 0xBA39, 0xBA53, LBP_H3 }, + { 0xBA54, 0xBA54, LBP_H2 }, + { 0xBA55, 0xBA6F, LBP_H3 }, + { 0xBA70, 0xBA70, LBP_H2 }, + { 0xBA71, 0xBA8B, LBP_H3 }, + { 0xBA8C, 0xBA8C, LBP_H2 }, + { 0xBA8D, 0xBAA7, LBP_H3 }, + { 0xBAA8, 0xBAA8, LBP_H2 }, + { 0xBAA9, 0xBAC3, LBP_H3 }, + { 0xBAC4, 0xBAC4, LBP_H2 }, + { 0xBAC5, 0xBADF, LBP_H3 }, + { 0xBAE0, 0xBAE0, LBP_H2 }, + { 0xBAE1, 0xBAFB, LBP_H3 }, + { 0xBAFC, 0xBAFC, LBP_H2 }, + { 0xBAFD, 0xBB17, LBP_H3 }, + { 0xBB18, 0xBB18, LBP_H2 }, + { 0xBB19, 0xBB33, LBP_H3 }, + { 0xBB34, 0xBB34, LBP_H2 }, + { 0xBB35, 0xBB4F, LBP_H3 }, + { 0xBB50, 0xBB50, LBP_H2 }, + { 0xBB51, 0xBB6B, LBP_H3 }, + { 0xBB6C, 0xBB6C, LBP_H2 }, + { 0xBB6D, 0xBB87, LBP_H3 }, + { 0xBB88, 0xBB88, LBP_H2 }, + { 0xBB89, 0xBBA3, LBP_H3 }, + { 0xBBA4, 0xBBA4, LBP_H2 }, + { 0xBBA5, 0xBBBF, LBP_H3 }, + { 0xBBC0, 0xBBC0, LBP_H2 }, + { 0xBBC1, 0xBBDB, LBP_H3 }, + { 0xBBDC, 0xBBDC, LBP_H2 }, + { 0xBBDD, 0xBBF7, LBP_H3 }, + { 0xBBF8, 0xBBF8, LBP_H2 }, + { 0xBBF9, 0xBC13, LBP_H3 }, + { 0xBC14, 0xBC14, LBP_H2 }, + { 0xBC15, 0xBC2F, LBP_H3 }, + { 0xBC30, 0xBC30, LBP_H2 }, + { 0xBC31, 0xBC4B, LBP_H3 }, + { 0xBC4C, 0xBC4C, LBP_H2 }, + { 0xBC4D, 0xBC67, LBP_H3 }, + { 0xBC68, 0xBC68, LBP_H2 }, + { 0xBC69, 0xBC83, LBP_H3 }, + { 0xBC84, 0xBC84, LBP_H2 }, + { 0xBC85, 0xBC9F, LBP_H3 }, + { 0xBCA0, 0xBCA0, LBP_H2 }, + { 0xBCA1, 0xBCBB, LBP_H3 }, + { 0xBCBC, 0xBCBC, LBP_H2 }, + { 0xBCBD, 0xBCD7, LBP_H3 }, + { 0xBCD8, 0xBCD8, LBP_H2 }, + { 0xBCD9, 0xBCF3, LBP_H3 }, + { 
0xBCF4, 0xBCF4, LBP_H2 }, + { 0xBCF5, 0xBD0F, LBP_H3 }, + { 0xBD10, 0xBD10, LBP_H2 }, + { 0xBD11, 0xBD2B, LBP_H3 }, + { 0xBD2C, 0xBD2C, LBP_H2 }, + { 0xBD2D, 0xBD47, LBP_H3 }, + { 0xBD48, 0xBD48, LBP_H2 }, + { 0xBD49, 0xBD63, LBP_H3 }, + { 0xBD64, 0xBD64, LBP_H2 }, + { 0xBD65, 0xBD7F, LBP_H3 }, + { 0xBD80, 0xBD80, LBP_H2 }, + { 0xBD81, 0xBD9B, LBP_H3 }, + { 0xBD9C, 0xBD9C, LBP_H2 }, + { 0xBD9D, 0xBDB7, LBP_H3 }, + { 0xBDB8, 0xBDB8, LBP_H2 }, + { 0xBDB9, 0xBDD3, LBP_H3 }, + { 0xBDD4, 0xBDD4, LBP_H2 }, + { 0xBDD5, 0xBDEF, LBP_H3 }, + { 0xBDF0, 0xBDF0, LBP_H2 }, + { 0xBDF1, 0xBE0B, LBP_H3 }, + { 0xBE0C, 0xBE0C, LBP_H2 }, + { 0xBE0D, 0xBE27, LBP_H3 }, + { 0xBE28, 0xBE28, LBP_H2 }, + { 0xBE29, 0xBE43, LBP_H3 }, + { 0xBE44, 0xBE44, LBP_H2 }, + { 0xBE45, 0xBE5F, LBP_H3 }, + { 0xBE60, 0xBE60, LBP_H2 }, + { 0xBE61, 0xBE7B, LBP_H3 }, + { 0xBE7C, 0xBE7C, LBP_H2 }, + { 0xBE7D, 0xBE97, LBP_H3 }, + { 0xBE98, 0xBE98, LBP_H2 }, + { 0xBE99, 0xBEB3, LBP_H3 }, + { 0xBEB4, 0xBEB4, LBP_H2 }, + { 0xBEB5, 0xBECF, LBP_H3 }, + { 0xBED0, 0xBED0, LBP_H2 }, + { 0xBED1, 0xBEEB, LBP_H3 }, + { 0xBEEC, 0xBEEC, LBP_H2 }, + { 0xBEED, 0xBF07, LBP_H3 }, + { 0xBF08, 0xBF08, LBP_H2 }, + { 0xBF09, 0xBF23, LBP_H3 }, + { 0xBF24, 0xBF24, LBP_H2 }, + { 0xBF25, 0xBF3F, LBP_H3 }, + { 0xBF40, 0xBF40, LBP_H2 }, + { 0xBF41, 0xBF5B, LBP_H3 }, + { 0xBF5C, 0xBF5C, LBP_H2 }, + { 0xBF5D, 0xBF77, LBP_H3 }, + { 0xBF78, 0xBF78, LBP_H2 }, + { 0xBF79, 0xBF93, LBP_H3 }, + { 0xBF94, 0xBF94, LBP_H2 }, + { 0xBF95, 0xBFAF, LBP_H3 }, + { 0xBFB0, 0xBFB0, LBP_H2 }, + { 0xBFB1, 0xBFCB, LBP_H3 }, + { 0xBFCC, 0xBFCC, LBP_H2 }, + { 0xBFCD, 0xBFE7, LBP_H3 }, + { 0xBFE8, 0xBFE8, LBP_H2 }, + { 0xBFE9, 0xC003, LBP_H3 }, + { 0xC004, 0xC004, LBP_H2 }, + { 0xC005, 0xC01F, LBP_H3 }, + { 0xC020, 0xC020, LBP_H2 }, + { 0xC021, 0xC03B, LBP_H3 }, + { 0xC03C, 0xC03C, LBP_H2 }, + { 0xC03D, 0xC057, LBP_H3 }, + { 0xC058, 0xC058, LBP_H2 }, + { 0xC059, 0xC073, LBP_H3 }, + { 0xC074, 0xC074, LBP_H2 }, + { 0xC075, 0xC08F, LBP_H3 }, + { 0xC090, 0xC090, 
LBP_H2 }, + { 0xC091, 0xC0AB, LBP_H3 }, + { 0xC0AC, 0xC0AC, LBP_H2 }, + { 0xC0AD, 0xC0C7, LBP_H3 }, + { 0xC0C8, 0xC0C8, LBP_H2 }, + { 0xC0C9, 0xC0E3, LBP_H3 }, + { 0xC0E4, 0xC0E4, LBP_H2 }, + { 0xC0E5, 0xC0FF, LBP_H3 }, + { 0xC100, 0xC100, LBP_H2 }, + { 0xC101, 0xC11B, LBP_H3 }, + { 0xC11C, 0xC11C, LBP_H2 }, + { 0xC11D, 0xC137, LBP_H3 }, + { 0xC138, 0xC138, LBP_H2 }, + { 0xC139, 0xC153, LBP_H3 }, + { 0xC154, 0xC154, LBP_H2 }, + { 0xC155, 0xC16F, LBP_H3 }, + { 0xC170, 0xC170, LBP_H2 }, + { 0xC171, 0xC18B, LBP_H3 }, + { 0xC18C, 0xC18C, LBP_H2 }, + { 0xC18D, 0xC1A7, LBP_H3 }, + { 0xC1A8, 0xC1A8, LBP_H2 }, + { 0xC1A9, 0xC1C3, LBP_H3 }, + { 0xC1C4, 0xC1C4, LBP_H2 }, + { 0xC1C5, 0xC1DF, LBP_H3 }, + { 0xC1E0, 0xC1E0, LBP_H2 }, + { 0xC1E1, 0xC1FB, LBP_H3 }, + { 0xC1FC, 0xC1FC, LBP_H2 }, + { 0xC1FD, 0xC217, LBP_H3 }, + { 0xC218, 0xC218, LBP_H2 }, + { 0xC219, 0xC233, LBP_H3 }, + { 0xC234, 0xC234, LBP_H2 }, + { 0xC235, 0xC24F, LBP_H3 }, + { 0xC250, 0xC250, LBP_H2 }, + { 0xC251, 0xC26B, LBP_H3 }, + { 0xC26C, 0xC26C, LBP_H2 }, + { 0xC26D, 0xC287, LBP_H3 }, + { 0xC288, 0xC288, LBP_H2 }, + { 0xC289, 0xC2A3, LBP_H3 }, + { 0xC2A4, 0xC2A4, LBP_H2 }, + { 0xC2A5, 0xC2BF, LBP_H3 }, + { 0xC2C0, 0xC2C0, LBP_H2 }, + { 0xC2C1, 0xC2DB, LBP_H3 }, + { 0xC2DC, 0xC2DC, LBP_H2 }, + { 0xC2DD, 0xC2F7, LBP_H3 }, + { 0xC2F8, 0xC2F8, LBP_H2 }, + { 0xC2F9, 0xC313, LBP_H3 }, + { 0xC314, 0xC314, LBP_H2 }, + { 0xC315, 0xC32F, LBP_H3 }, + { 0xC330, 0xC330, LBP_H2 }, + { 0xC331, 0xC34B, LBP_H3 }, + { 0xC34C, 0xC34C, LBP_H2 }, + { 0xC34D, 0xC367, LBP_H3 }, + { 0xC368, 0xC368, LBP_H2 }, + { 0xC369, 0xC383, LBP_H3 }, + { 0xC384, 0xC384, LBP_H2 }, + { 0xC385, 0xC39F, LBP_H3 }, + { 0xC3A0, 0xC3A0, LBP_H2 }, + { 0xC3A1, 0xC3BB, LBP_H3 }, + { 0xC3BC, 0xC3BC, LBP_H2 }, + { 0xC3BD, 0xC3D7, LBP_H3 }, + { 0xC3D8, 0xC3D8, LBP_H2 }, + { 0xC3D9, 0xC3F3, LBP_H3 }, + { 0xC3F4, 0xC3F4, LBP_H2 }, + { 0xC3F5, 0xC40F, LBP_H3 }, + { 0xC410, 0xC410, LBP_H2 }, + { 0xC411, 0xC42B, LBP_H3 }, + { 0xC42C, 0xC42C, LBP_H2 }, + { 
0xC42D, 0xC447, LBP_H3 }, + { 0xC448, 0xC448, LBP_H2 }, + { 0xC449, 0xC463, LBP_H3 }, + { 0xC464, 0xC464, LBP_H2 }, + { 0xC465, 0xC47F, LBP_H3 }, + { 0xC480, 0xC480, LBP_H2 }, + { 0xC481, 0xC49B, LBP_H3 }, + { 0xC49C, 0xC49C, LBP_H2 }, + { 0xC49D, 0xC4B7, LBP_H3 }, + { 0xC4B8, 0xC4B8, LBP_H2 }, + { 0xC4B9, 0xC4D3, LBP_H3 }, + { 0xC4D4, 0xC4D4, LBP_H2 }, + { 0xC4D5, 0xC4EF, LBP_H3 }, + { 0xC4F0, 0xC4F0, LBP_H2 }, + { 0xC4F1, 0xC50B, LBP_H3 }, + { 0xC50C, 0xC50C, LBP_H2 }, + { 0xC50D, 0xC527, LBP_H3 }, + { 0xC528, 0xC528, LBP_H2 }, + { 0xC529, 0xC543, LBP_H3 }, + { 0xC544, 0xC544, LBP_H2 }, + { 0xC545, 0xC55F, LBP_H3 }, + { 0xC560, 0xC560, LBP_H2 }, + { 0xC561, 0xC57B, LBP_H3 }, + { 0xC57C, 0xC57C, LBP_H2 }, + { 0xC57D, 0xC597, LBP_H3 }, + { 0xC598, 0xC598, LBP_H2 }, + { 0xC599, 0xC5B3, LBP_H3 }, + { 0xC5B4, 0xC5B4, LBP_H2 }, + { 0xC5B5, 0xC5CF, LBP_H3 }, + { 0xC5D0, 0xC5D0, LBP_H2 }, + { 0xC5D1, 0xC5EB, LBP_H3 }, + { 0xC5EC, 0xC5EC, LBP_H2 }, + { 0xC5ED, 0xC607, LBP_H3 }, + { 0xC608, 0xC608, LBP_H2 }, + { 0xC609, 0xC623, LBP_H3 }, + { 0xC624, 0xC624, LBP_H2 }, + { 0xC625, 0xC63F, LBP_H3 }, + { 0xC640, 0xC640, LBP_H2 }, + { 0xC641, 0xC65B, LBP_H3 }, + { 0xC65C, 0xC65C, LBP_H2 }, + { 0xC65D, 0xC677, LBP_H3 }, + { 0xC678, 0xC678, LBP_H2 }, + { 0xC679, 0xC693, LBP_H3 }, + { 0xC694, 0xC694, LBP_H2 }, + { 0xC695, 0xC6AF, LBP_H3 }, + { 0xC6B0, 0xC6B0, LBP_H2 }, + { 0xC6B1, 0xC6CB, LBP_H3 }, + { 0xC6CC, 0xC6CC, LBP_H2 }, + { 0xC6CD, 0xC6E7, LBP_H3 }, + { 0xC6E8, 0xC6E8, LBP_H2 }, + { 0xC6E9, 0xC703, LBP_H3 }, + { 0xC704, 0xC704, LBP_H2 }, + { 0xC705, 0xC71F, LBP_H3 }, + { 0xC720, 0xC720, LBP_H2 }, + { 0xC721, 0xC73B, LBP_H3 }, + { 0xC73C, 0xC73C, LBP_H2 }, + { 0xC73D, 0xC757, LBP_H3 }, + { 0xC758, 0xC758, LBP_H2 }, + { 0xC759, 0xC773, LBP_H3 }, + { 0xC774, 0xC774, LBP_H2 }, + { 0xC775, 0xC78F, LBP_H3 }, + { 0xC790, 0xC790, LBP_H2 }, + { 0xC791, 0xC7AB, LBP_H3 }, + { 0xC7AC, 0xC7AC, LBP_H2 }, + { 0xC7AD, 0xC7C7, LBP_H3 }, + { 0xC7C8, 0xC7C8, LBP_H2 }, + { 0xC7C9, 0xC7E3, 
LBP_H3 }, + { 0xC7E4, 0xC7E4, LBP_H2 }, + { 0xC7E5, 0xC7FF, LBP_H3 }, + { 0xC800, 0xC800, LBP_H2 }, + { 0xC801, 0xC81B, LBP_H3 }, + { 0xC81C, 0xC81C, LBP_H2 }, + { 0xC81D, 0xC837, LBP_H3 }, + { 0xC838, 0xC838, LBP_H2 }, + { 0xC839, 0xC853, LBP_H3 }, + { 0xC854, 0xC854, LBP_H2 }, + { 0xC855, 0xC86F, LBP_H3 }, + { 0xC870, 0xC870, LBP_H2 }, + { 0xC871, 0xC88B, LBP_H3 }, + { 0xC88C, 0xC88C, LBP_H2 }, + { 0xC88D, 0xC8A7, LBP_H3 }, + { 0xC8A8, 0xC8A8, LBP_H2 }, + { 0xC8A9, 0xC8C3, LBP_H3 }, + { 0xC8C4, 0xC8C4, LBP_H2 }, + { 0xC8C5, 0xC8DF, LBP_H3 }, + { 0xC8E0, 0xC8E0, LBP_H2 }, + { 0xC8E1, 0xC8FB, LBP_H3 }, + { 0xC8FC, 0xC8FC, LBP_H2 }, + { 0xC8FD, 0xC917, LBP_H3 }, + { 0xC918, 0xC918, LBP_H2 }, + { 0xC919, 0xC933, LBP_H3 }, + { 0xC934, 0xC934, LBP_H2 }, + { 0xC935, 0xC94F, LBP_H3 }, + { 0xC950, 0xC950, LBP_H2 }, + { 0xC951, 0xC96B, LBP_H3 }, + { 0xC96C, 0xC96C, LBP_H2 }, + { 0xC96D, 0xC987, LBP_H3 }, + { 0xC988, 0xC988, LBP_H2 }, + { 0xC989, 0xC9A3, LBP_H3 }, + { 0xC9A4, 0xC9A4, LBP_H2 }, + { 0xC9A5, 0xC9BF, LBP_H3 }, + { 0xC9C0, 0xC9C0, LBP_H2 }, + { 0xC9C1, 0xC9DB, LBP_H3 }, + { 0xC9DC, 0xC9DC, LBP_H2 }, + { 0xC9DD, 0xC9F7, LBP_H3 }, + { 0xC9F8, 0xC9F8, LBP_H2 }, + { 0xC9F9, 0xCA13, LBP_H3 }, + { 0xCA14, 0xCA14, LBP_H2 }, + { 0xCA15, 0xCA2F, LBP_H3 }, + { 0xCA30, 0xCA30, LBP_H2 }, + { 0xCA31, 0xCA4B, LBP_H3 }, + { 0xCA4C, 0xCA4C, LBP_H2 }, + { 0xCA4D, 0xCA67, LBP_H3 }, + { 0xCA68, 0xCA68, LBP_H2 }, + { 0xCA69, 0xCA83, LBP_H3 }, + { 0xCA84, 0xCA84, LBP_H2 }, + { 0xCA85, 0xCA9F, LBP_H3 }, + { 0xCAA0, 0xCAA0, LBP_H2 }, + { 0xCAA1, 0xCABB, LBP_H3 }, + { 0xCABC, 0xCABC, LBP_H2 }, + { 0xCABD, 0xCAD7, LBP_H3 }, + { 0xCAD8, 0xCAD8, LBP_H2 }, + { 0xCAD9, 0xCAF3, LBP_H3 }, + { 0xCAF4, 0xCAF4, LBP_H2 }, + { 0xCAF5, 0xCB0F, LBP_H3 }, + { 0xCB10, 0xCB10, LBP_H2 }, + { 0xCB11, 0xCB2B, LBP_H3 }, + { 0xCB2C, 0xCB2C, LBP_H2 }, + { 0xCB2D, 0xCB47, LBP_H3 }, + { 0xCB48, 0xCB48, LBP_H2 }, + { 0xCB49, 0xCB63, LBP_H3 }, + { 0xCB64, 0xCB64, LBP_H2 }, + { 0xCB65, 0xCB7F, LBP_H3 }, + { 
0xCB80, 0xCB80, LBP_H2 }, + { 0xCB81, 0xCB9B, LBP_H3 }, + { 0xCB9C, 0xCB9C, LBP_H2 }, + { 0xCB9D, 0xCBB7, LBP_H3 }, + { 0xCBB8, 0xCBB8, LBP_H2 }, + { 0xCBB9, 0xCBD3, LBP_H3 }, + { 0xCBD4, 0xCBD4, LBP_H2 }, + { 0xCBD5, 0xCBEF, LBP_H3 }, + { 0xCBF0, 0xCBF0, LBP_H2 }, + { 0xCBF1, 0xCC0B, LBP_H3 }, + { 0xCC0C, 0xCC0C, LBP_H2 }, + { 0xCC0D, 0xCC27, LBP_H3 }, + { 0xCC28, 0xCC28, LBP_H2 }, + { 0xCC29, 0xCC43, LBP_H3 }, + { 0xCC44, 0xCC44, LBP_H2 }, + { 0xCC45, 0xCC5F, LBP_H3 }, + { 0xCC60, 0xCC60, LBP_H2 }, + { 0xCC61, 0xCC7B, LBP_H3 }, + { 0xCC7C, 0xCC7C, LBP_H2 }, + { 0xCC7D, 0xCC97, LBP_H3 }, + { 0xCC98, 0xCC98, LBP_H2 }, + { 0xCC99, 0xCCB3, LBP_H3 }, + { 0xCCB4, 0xCCB4, LBP_H2 }, + { 0xCCB5, 0xCCCF, LBP_H3 }, + { 0xCCD0, 0xCCD0, LBP_H2 }, + { 0xCCD1, 0xCCEB, LBP_H3 }, + { 0xCCEC, 0xCCEC, LBP_H2 }, + { 0xCCED, 0xCD07, LBP_H3 }, + { 0xCD08, 0xCD08, LBP_H2 }, + { 0xCD09, 0xCD23, LBP_H3 }, + { 0xCD24, 0xCD24, LBP_H2 }, + { 0xCD25, 0xCD3F, LBP_H3 }, + { 0xCD40, 0xCD40, LBP_H2 }, + { 0xCD41, 0xCD5B, LBP_H3 }, + { 0xCD5C, 0xCD5C, LBP_H2 }, + { 0xCD5D, 0xCD77, LBP_H3 }, + { 0xCD78, 0xCD78, LBP_H2 }, + { 0xCD79, 0xCD93, LBP_H3 }, + { 0xCD94, 0xCD94, LBP_H2 }, + { 0xCD95, 0xCDAF, LBP_H3 }, + { 0xCDB0, 0xCDB0, LBP_H2 }, + { 0xCDB1, 0xCDCB, LBP_H3 }, + { 0xCDCC, 0xCDCC, LBP_H2 }, + { 0xCDCD, 0xCDE7, LBP_H3 }, + { 0xCDE8, 0xCDE8, LBP_H2 }, + { 0xCDE9, 0xCE03, LBP_H3 }, + { 0xCE04, 0xCE04, LBP_H2 }, + { 0xCE05, 0xCE1F, LBP_H3 }, + { 0xCE20, 0xCE20, LBP_H2 }, + { 0xCE21, 0xCE3B, LBP_H3 }, + { 0xCE3C, 0xCE3C, LBP_H2 }, + { 0xCE3D, 0xCE57, LBP_H3 }, + { 0xCE58, 0xCE58, LBP_H2 }, + { 0xCE59, 0xCE73, LBP_H3 }, + { 0xCE74, 0xCE74, LBP_H2 }, + { 0xCE75, 0xCE8F, LBP_H3 }, + { 0xCE90, 0xCE90, LBP_H2 }, + { 0xCE91, 0xCEAB, LBP_H3 }, + { 0xCEAC, 0xCEAC, LBP_H2 }, + { 0xCEAD, 0xCEC7, LBP_H3 }, + { 0xCEC8, 0xCEC8, LBP_H2 }, + { 0xCEC9, 0xCEE3, LBP_H3 }, + { 0xCEE4, 0xCEE4, LBP_H2 }, + { 0xCEE5, 0xCEFF, LBP_H3 }, + { 0xCF00, 0xCF00, LBP_H2 }, + { 0xCF01, 0xCF1B, LBP_H3 }, + { 0xCF1C, 0xCF1C, 
LBP_H2 }, + { 0xCF1D, 0xCF37, LBP_H3 }, + { 0xCF38, 0xCF38, LBP_H2 }, + { 0xCF39, 0xCF53, LBP_H3 }, + { 0xCF54, 0xCF54, LBP_H2 }, + { 0xCF55, 0xCF6F, LBP_H3 }, + { 0xCF70, 0xCF70, LBP_H2 }, + { 0xCF71, 0xCF8B, LBP_H3 }, + { 0xCF8C, 0xCF8C, LBP_H2 }, + { 0xCF8D, 0xCFA7, LBP_H3 }, + { 0xCFA8, 0xCFA8, LBP_H2 }, + { 0xCFA9, 0xCFC3, LBP_H3 }, + { 0xCFC4, 0xCFC4, LBP_H2 }, + { 0xCFC5, 0xCFDF, LBP_H3 }, + { 0xCFE0, 0xCFE0, LBP_H2 }, + { 0xCFE1, 0xCFFB, LBP_H3 }, + { 0xCFFC, 0xCFFC, LBP_H2 }, + { 0xCFFD, 0xD017, LBP_H3 }, + { 0xD018, 0xD018, LBP_H2 }, + { 0xD019, 0xD033, LBP_H3 }, + { 0xD034, 0xD034, LBP_H2 }, + { 0xD035, 0xD04F, LBP_H3 }, + { 0xD050, 0xD050, LBP_H2 }, + { 0xD051, 0xD06B, LBP_H3 }, + { 0xD06C, 0xD06C, LBP_H2 }, + { 0xD06D, 0xD087, LBP_H3 }, + { 0xD088, 0xD088, LBP_H2 }, + { 0xD089, 0xD0A3, LBP_H3 }, + { 0xD0A4, 0xD0A4, LBP_H2 }, + { 0xD0A5, 0xD0BF, LBP_H3 }, + { 0xD0C0, 0xD0C0, LBP_H2 }, + { 0xD0C1, 0xD0DB, LBP_H3 }, + { 0xD0DC, 0xD0DC, LBP_H2 }, + { 0xD0DD, 0xD0F7, LBP_H3 }, + { 0xD0F8, 0xD0F8, LBP_H2 }, + { 0xD0F9, 0xD113, LBP_H3 }, + { 0xD114, 0xD114, LBP_H2 }, + { 0xD115, 0xD12F, LBP_H3 }, + { 0xD130, 0xD130, LBP_H2 }, + { 0xD131, 0xD14B, LBP_H3 }, + { 0xD14C, 0xD14C, LBP_H2 }, + { 0xD14D, 0xD167, LBP_H3 }, + { 0xD168, 0xD168, LBP_H2 }, + { 0xD169, 0xD183, LBP_H3 }, + { 0xD184, 0xD184, LBP_H2 }, + { 0xD185, 0xD19F, LBP_H3 }, + { 0xD1A0, 0xD1A0, LBP_H2 }, + { 0xD1A1, 0xD1BB, LBP_H3 }, + { 0xD1BC, 0xD1BC, LBP_H2 }, + { 0xD1BD, 0xD1D7, LBP_H3 }, + { 0xD1D8, 0xD1D8, LBP_H2 }, + { 0xD1D9, 0xD1F3, LBP_H3 }, + { 0xD1F4, 0xD1F4, LBP_H2 }, + { 0xD1F5, 0xD20F, LBP_H3 }, + { 0xD210, 0xD210, LBP_H2 }, + { 0xD211, 0xD22B, LBP_H3 }, + { 0xD22C, 0xD22C, LBP_H2 }, + { 0xD22D, 0xD247, LBP_H3 }, + { 0xD248, 0xD248, LBP_H2 }, + { 0xD249, 0xD263, LBP_H3 }, + { 0xD264, 0xD264, LBP_H2 }, + { 0xD265, 0xD27F, LBP_H3 }, + { 0xD280, 0xD280, LBP_H2 }, + { 0xD281, 0xD29B, LBP_H3 }, + { 0xD29C, 0xD29C, LBP_H2 }, + { 0xD29D, 0xD2B7, LBP_H3 }, + { 0xD2B8, 0xD2B8, LBP_H2 }, + { 
0xD2B9, 0xD2D3, LBP_H3 }, + { 0xD2D4, 0xD2D4, LBP_H2 }, + { 0xD2D5, 0xD2EF, LBP_H3 }, + { 0xD2F0, 0xD2F0, LBP_H2 }, + { 0xD2F1, 0xD30B, LBP_H3 }, + { 0xD30C, 0xD30C, LBP_H2 }, + { 0xD30D, 0xD327, LBP_H3 }, + { 0xD328, 0xD328, LBP_H2 }, + { 0xD329, 0xD343, LBP_H3 }, + { 0xD344, 0xD344, LBP_H2 }, + { 0xD345, 0xD35F, LBP_H3 }, + { 0xD360, 0xD360, LBP_H2 }, + { 0xD361, 0xD37B, LBP_H3 }, + { 0xD37C, 0xD37C, LBP_H2 }, + { 0xD37D, 0xD397, LBP_H3 }, + { 0xD398, 0xD398, LBP_H2 }, + { 0xD399, 0xD3B3, LBP_H3 }, + { 0xD3B4, 0xD3B4, LBP_H2 }, + { 0xD3B5, 0xD3CF, LBP_H3 }, + { 0xD3D0, 0xD3D0, LBP_H2 }, + { 0xD3D1, 0xD3EB, LBP_H3 }, + { 0xD3EC, 0xD3EC, LBP_H2 }, + { 0xD3ED, 0xD407, LBP_H3 }, + { 0xD408, 0xD408, LBP_H2 }, + { 0xD409, 0xD423, LBP_H3 }, + { 0xD424, 0xD424, LBP_H2 }, + { 0xD425, 0xD43F, LBP_H3 }, + { 0xD440, 0xD440, LBP_H2 }, + { 0xD441, 0xD45B, LBP_H3 }, + { 0xD45C, 0xD45C, LBP_H2 }, + { 0xD45D, 0xD477, LBP_H3 }, + { 0xD478, 0xD478, LBP_H2 }, + { 0xD479, 0xD493, LBP_H3 }, + { 0xD494, 0xD494, LBP_H2 }, + { 0xD495, 0xD4AF, LBP_H3 }, + { 0xD4B0, 0xD4B0, LBP_H2 }, + { 0xD4B1, 0xD4CB, LBP_H3 }, + { 0xD4CC, 0xD4CC, LBP_H2 }, + { 0xD4CD, 0xD4E7, LBP_H3 }, + { 0xD4E8, 0xD4E8, LBP_H2 }, + { 0xD4E9, 0xD503, LBP_H3 }, + { 0xD504, 0xD504, LBP_H2 }, + { 0xD505, 0xD51F, LBP_H3 }, + { 0xD520, 0xD520, LBP_H2 }, + { 0xD521, 0xD53B, LBP_H3 }, + { 0xD53C, 0xD53C, LBP_H2 }, + { 0xD53D, 0xD557, LBP_H3 }, + { 0xD558, 0xD558, LBP_H2 }, + { 0xD559, 0xD573, LBP_H3 }, + { 0xD574, 0xD574, LBP_H2 }, + { 0xD575, 0xD58F, LBP_H3 }, + { 0xD590, 0xD590, LBP_H2 }, + { 0xD591, 0xD5AB, LBP_H3 }, + { 0xD5AC, 0xD5AC, LBP_H2 }, + { 0xD5AD, 0xD5C7, LBP_H3 }, + { 0xD5C8, 0xD5C8, LBP_H2 }, + { 0xD5C9, 0xD5E3, LBP_H3 }, + { 0xD5E4, 0xD5E4, LBP_H2 }, + { 0xD5E5, 0xD5FF, LBP_H3 }, + { 0xD600, 0xD600, LBP_H2 }, + { 0xD601, 0xD61B, LBP_H3 }, + { 0xD61C, 0xD61C, LBP_H2 }, + { 0xD61D, 0xD637, LBP_H3 }, + { 0xD638, 0xD638, LBP_H2 }, + { 0xD639, 0xD653, LBP_H3 }, + { 0xD654, 0xD654, LBP_H2 }, + { 0xD655, 0xD66F, 
LBP_H3 }, + { 0xD670, 0xD670, LBP_H2 }, + { 0xD671, 0xD68B, LBP_H3 }, + { 0xD68C, 0xD68C, LBP_H2 }, + { 0xD68D, 0xD6A7, LBP_H3 }, + { 0xD6A8, 0xD6A8, LBP_H2 }, + { 0xD6A9, 0xD6C3, LBP_H3 }, + { 0xD6C4, 0xD6C4, LBP_H2 }, + { 0xD6C5, 0xD6DF, LBP_H3 }, + { 0xD6E0, 0xD6E0, LBP_H2 }, + { 0xD6E1, 0xD6FB, LBP_H3 }, + { 0xD6FC, 0xD6FC, LBP_H2 }, + { 0xD6FD, 0xD717, LBP_H3 }, + { 0xD718, 0xD718, LBP_H2 }, + { 0xD719, 0xD733, LBP_H3 }, + { 0xD734, 0xD734, LBP_H2 }, + { 0xD735, 0xD74F, LBP_H3 }, + { 0xD750, 0xD750, LBP_H2 }, + { 0xD751, 0xD76B, LBP_H3 }, + { 0xD76C, 0xD76C, LBP_H2 }, + { 0xD76D, 0xD787, LBP_H3 }, + { 0xD788, 0xD788, LBP_H2 }, + { 0xD789, 0xD7A3, LBP_H3 }, + { 0xD7B0, 0xD7C6, LBP_JV }, + { 0xD7CB, 0xD7FB, LBP_JT }, + { 0xD800, 0xDFFF, LBP_SG }, + { 0xE000, 0xF8FF, LBP_XX }, + { 0xF900, 0xFAFF, LBP_ID }, + { 0xFB00, 0xFB1D, LBP_AL }, + { 0xFB1E, 0xFB1E, LBP_CM }, + { 0xFB1F, 0xFD3D, LBP_AL }, + { 0xFD3E, 0xFD3E, LBP_OP }, + { 0xFD3F, 0xFD3F, LBP_CL }, + { 0xFD50, 0xFDFB, LBP_AL }, + { 0xFDFC, 0xFDFC, LBP_PO }, + { 0xFDFD, 0xFDFD, LBP_AL }, + { 0xFE00, 0xFE0F, LBP_CM }, + { 0xFE10, 0xFE10, LBP_IS }, + { 0xFE11, 0xFE12, LBP_CL }, + { 0xFE13, 0xFE14, LBP_IS }, + { 0xFE15, 0xFE16, LBP_EX }, + { 0xFE17, 0xFE17, LBP_OP }, + { 0xFE18, 0xFE18, LBP_CL }, + { 0xFE19, 0xFE19, LBP_IN }, + { 0xFE20, 0xFE26, LBP_CM }, + { 0xFE30, 0xFE34, LBP_ID }, + { 0xFE35, 0xFE35, LBP_OP }, + { 0xFE36, 0xFE36, LBP_CL }, + { 0xFE37, 0xFE37, LBP_OP }, + { 0xFE38, 0xFE38, LBP_CL }, + { 0xFE39, 0xFE39, LBP_OP }, + { 0xFE3A, 0xFE3A, LBP_CL }, + { 0xFE3B, 0xFE3B, LBP_OP }, + { 0xFE3C, 0xFE3C, LBP_CL }, + { 0xFE3D, 0xFE3D, LBP_OP }, + { 0xFE3E, 0xFE3E, LBP_CL }, + { 0xFE3F, 0xFE3F, LBP_OP }, + { 0xFE40, 0xFE40, LBP_CL }, + { 0xFE41, 0xFE41, LBP_OP }, + { 0xFE42, 0xFE42, LBP_CL }, + { 0xFE43, 0xFE43, LBP_OP }, + { 0xFE44, 0xFE44, LBP_CL }, + { 0xFE45, 0xFE46, LBP_ID }, + { 0xFE47, 0xFE47, LBP_OP }, + { 0xFE48, 0xFE48, LBP_CL }, + { 0xFE49, 0xFE4F, LBP_ID }, + { 0xFE50, 0xFE50, LBP_CL }, + { 
0xFE51, 0xFE51, LBP_ID }, + { 0xFE52, 0xFE52, LBP_CL }, + { 0xFE54, 0xFE55, LBP_NS }, + { 0xFE56, 0xFE57, LBP_EX }, + { 0xFE58, 0xFE58, LBP_ID }, + { 0xFE59, 0xFE59, LBP_OP }, + { 0xFE5A, 0xFE5A, LBP_CL }, + { 0xFE5B, 0xFE5B, LBP_OP }, + { 0xFE5C, 0xFE5C, LBP_CL }, + { 0xFE5D, 0xFE5D, LBP_OP }, + { 0xFE5E, 0xFE5E, LBP_CL }, + { 0xFE5F, 0xFE68, LBP_ID }, + { 0xFE69, 0xFE69, LBP_PR }, + { 0xFE6A, 0xFE6A, LBP_PO }, + { 0xFE6B, 0xFE6B, LBP_ID }, + { 0xFE70, 0xFEFC, LBP_AL }, + { 0xFEFF, 0xFEFF, LBP_WJ }, + { 0xFF01, 0xFF01, LBP_EX }, + { 0xFF02, 0xFF03, LBP_ID }, + { 0xFF04, 0xFF04, LBP_PR }, + { 0xFF05, 0xFF05, LBP_PO }, + { 0xFF06, 0xFF07, LBP_ID }, + { 0xFF08, 0xFF08, LBP_OP }, + { 0xFF09, 0xFF09, LBP_CL }, + { 0xFF0A, 0xFF0B, LBP_ID }, + { 0xFF0C, 0xFF0C, LBP_CL }, + { 0xFF0D, 0xFF0D, LBP_ID }, + { 0xFF0E, 0xFF0E, LBP_CL }, + { 0xFF0F, 0xFF19, LBP_ID }, + { 0xFF1A, 0xFF1B, LBP_NS }, + { 0xFF1C, 0xFF1E, LBP_ID }, + { 0xFF1F, 0xFF1F, LBP_EX }, + { 0xFF20, 0xFF3A, LBP_ID }, + { 0xFF3B, 0xFF3B, LBP_OP }, + { 0xFF3C, 0xFF3C, LBP_ID }, + { 0xFF3D, 0xFF3D, LBP_CL }, + { 0xFF3E, 0xFF5A, LBP_ID }, + { 0xFF5B, 0xFF5B, LBP_OP }, + { 0xFF5C, 0xFF5C, LBP_ID }, + { 0xFF5D, 0xFF5D, LBP_CL }, + { 0xFF5E, 0xFF5E, LBP_ID }, + { 0xFF5F, 0xFF5F, LBP_OP }, + { 0xFF60, 0xFF61, LBP_CL }, + { 0xFF62, 0xFF62, LBP_OP }, + { 0xFF63, 0xFF64, LBP_CL }, + { 0xFF65, 0xFF65, LBP_NS }, + { 0xFF66, 0xFF66, LBP_AL }, + { 0xFF67, 0xFF70, LBP_NS }, + { 0xFF71, 0xFF9D, LBP_AL }, + { 0xFF9E, 0xFF9F, LBP_NS }, + { 0xFFA0, 0xFFDC, LBP_AL }, + { 0xFFE0, 0xFFE0, LBP_PO }, + { 0xFFE1, 0xFFE1, LBP_PR }, + { 0xFFE2, 0xFFE4, LBP_ID }, + { 0xFFE5, 0xFFE6, LBP_PR }, + { 0xFFE8, 0xFFEE, LBP_AL }, + { 0xFFF9, 0xFFFB, LBP_CM }, + { 0xFFFC, 0xFFFC, LBP_CB }, + { 0xFFFD, 0xFFFD, LBP_AI }, + { 0x10000, 0x100FA, LBP_AL }, + { 0x10100, 0x10102, LBP_BA }, + { 0x10107, 0x101FC, LBP_AL }, + { 0x101FD, 0x101FD, LBP_CM }, + { 0x10280, 0x1039D, LBP_AL }, + { 0x1039F, 0x1039F, LBP_BA }, + { 0x103A0, 0x103CF, LBP_AL }, + { 
0x103D0, 0x103D0, LBP_BA }, + { 0x103D1, 0x1049D, LBP_AL }, + { 0x104A0, 0x104A9, LBP_NU }, + { 0x10800, 0x10855, LBP_AL }, + { 0x10857, 0x10857, LBP_BA }, + { 0x10858, 0x1091B, LBP_AL }, + { 0x1091F, 0x1091F, LBP_BA }, + { 0x10920, 0x10A00, LBP_AL }, + { 0x10A01, 0x10A0F, LBP_CM }, + { 0x10A10, 0x10A33, LBP_AL }, + { 0x10A38, 0x10A3F, LBP_CM }, + { 0x10A40, 0x10A47, LBP_AL }, + { 0x10A50, 0x10A57, LBP_BA }, + { 0x10A58, 0x10B35, LBP_AL }, + { 0x10B39, 0x10B3F, LBP_BA }, + { 0x10B40, 0x10E7E, LBP_AL }, + { 0x11000, 0x11002, LBP_CM }, + { 0x11003, 0x11037, LBP_AL }, + { 0x11038, 0x11046, LBP_CM }, + { 0x11047, 0x11048, LBP_BA }, + { 0x11049, 0x11065, LBP_AL }, + { 0x11066, 0x1106F, LBP_NU }, + { 0x11080, 0x11082, LBP_CM }, + { 0x11083, 0x110AF, LBP_AL }, + { 0x110B0, 0x110BA, LBP_CM }, + { 0x110BB, 0x110BD, LBP_AL }, + { 0x110BE, 0x110C1, LBP_BA }, + { 0x12000, 0x12462, LBP_AL }, + { 0x12470, 0x12473, LBP_BA }, + { 0x13000, 0x13257, LBP_AL }, + { 0x13258, 0x1325A, LBP_OP }, + { 0x1325B, 0x1325D, LBP_CL }, + { 0x1325E, 0x13281, LBP_AL }, + { 0x13282, 0x13282, LBP_CL }, + { 0x13283, 0x13285, LBP_AL }, + { 0x13286, 0x13286, LBP_OP }, + { 0x13287, 0x13287, LBP_CL }, + { 0x13288, 0x13288, LBP_OP }, + { 0x13289, 0x13289, LBP_CL }, + { 0x1328A, 0x13378, LBP_AL }, + { 0x13379, 0x13379, LBP_OP }, + { 0x1337A, 0x1337B, LBP_CL }, + { 0x1337C, 0x16A38, LBP_AL }, + { 0x1B000, 0x1B001, LBP_ID }, + { 0x1D000, 0x1D164, LBP_AL }, + { 0x1D165, 0x1D169, LBP_CM }, + { 0x1D16A, 0x1D16C, LBP_AL }, + { 0x1D16D, 0x1D182, LBP_CM }, + { 0x1D183, 0x1D184, LBP_AL }, + { 0x1D185, 0x1D18B, LBP_CM }, + { 0x1D18C, 0x1D1A9, LBP_AL }, + { 0x1D1AA, 0x1D1AD, LBP_CM }, + { 0x1D1AE, 0x1D241, LBP_AL }, + { 0x1D242, 0x1D244, LBP_CM }, + { 0x1D245, 0x1D7CB, LBP_AL }, + { 0x1D7CE, 0x1D7FF, LBP_NU }, + { 0x1F000, 0x1F0DF, LBP_AL }, + { 0x1F100, 0x1F12D, LBP_AI }, + { 0x1F12E, 0x1F12E, LBP_AL }, + { 0x1F130, 0x1F19A, LBP_AI }, + { 0x1F1E6, 0x1F1FF, LBP_AL }, + { 0x1F200, 0x1F251, LBP_ID }, + { 0x1F300, 
0x1F773, LBP_AL }, + { 0x20000, 0x3FFFD, LBP_ID }, + { 0xE0001, 0xE01EF, LBP_CM }, + { 0xF0000, 0x10FFFD, LBP_XX }, + { 0xFFFFFFFF, 0xFFFFFFFF, LBP_Undefined } +}; diff --git a/linebreak/linebreak/linebreakdata1.tmpl b/linebreak/linebreak/linebreakdata1.tmpl new file mode 100644 index 0000000..df06125 --- /dev/null +++ b/linebreak/linebreak/linebreakdata1.tmpl @@ -0,0 +1 @@ +/* The content of this file is generated from: diff --git a/linebreak/linebreak/linebreakdata2.tmpl b/linebreak/linebreak/linebreakdata2.tmpl new file mode 100644 index 0000000..60d0d37 --- /dev/null +++ b/linebreak/linebreak/linebreakdata2.tmpl @@ -0,0 +1,7 @@ +*/ + +#include "linebreak.h" +#include "linebreakdef.h" + +/** Default line breaking properties as from the Unicode Web site. */ +struct LineBreakProperties lb_prop_default[] = { diff --git a/linebreak/linebreak/linebreakdata3.tmpl b/linebreak/linebreak/linebreakdata3.tmpl new file mode 100644 index 0000000..a77017c --- /dev/null +++ b/linebreak/linebreak/linebreakdata3.tmpl @@ -0,0 +1,2 @@ + { 0xFFFFFFFF, 0xFFFFFFFF, LBP_Undefined } +}; diff --git a/linebreak/linebreak/linebreakdef.c b/linebreak/linebreak/linebreakdef.c new file mode 100644 index 0000000..eee01cb --- /dev/null +++ b/linebreak/linebreak/linebreakdef.c @@ -0,0 +1,139 @@ +/* vim: set tabstop=4 shiftwidth=4: */ + +/* + * Line breaking in a Unicode sequence. Designed to be used in a + * generic text renderer. + * + * Copyright (C) 2008-2011 Wu Yongwei + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute + * it freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must + * not claim that you wrote the original software. 
If you use this + * software in a product, an acknowledgement in the product + * documentation would be appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must + * not be misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source + * distribution. + * + * The main reference is Unicode Standard Annex 14 (UAX #14): + * <URL:http://www.unicode.org/reports/tr14/> + * + * When this library was designed, this annex was at Revision 19, for + * Unicode 5.0.0: + * <URL:http://www.unicode.org/reports/tr14/tr14-19.html> + * + * This library has been updated according to Revision 26, for + * Unicode 6.0.0: + * <URL:http://www.unicode.org/reports/tr14/tr14-26.html> + * + * The Unicode Terms of Use are available at + * <URL:http://www.unicode.org/copyright.html> + */ + +/** + * @file linebreakdef.c + * + * Definition of language-specific data. + * + * @version 2.1, 2011/05/07 + * @author Wu Yongwei + */ + +#include "linebreak.h" +#include "linebreakdef.h" + +/** + * English-specific data over the default Unicode rules. + */ +static struct LineBreakProperties lb_prop_English[] = { + { 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */ + { 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */ + { 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */ + { 0, 0, LBP_Undefined } +}; + +/** + * German-specific data over the default Unicode rules. + */ +static struct LineBreakProperties lb_prop_German[] = { + { 0x00AB, 0x00AB, LBP_CL }, /* Left double angle quotation mark: closing */ + { 0x00BB, 0x00BB, LBP_OP }, /* Right double angle quotation mark: opening */ + { 0x2018, 0x2018, LBP_CL }, /* Left single quotation mark: closing */ + { 0x201C, 0x201C, LBP_CL }, /* Left double quotation mark: closing */ + { 0x2039, 0x2039, LBP_CL }, /* Left single angle quotation mark: closing */ + { 0x203A, 0x203A, LBP_OP }, /* Right single angle quotation mark: opening */ + { 0, 0, LBP_Undefined } +}; + +/** + * Spanish-specific data over the default Unicode rules.
+ */ +static struct LineBreakProperties lb_prop_Spanish[] = { + { 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */ + { 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */ + { 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */ + { 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */ + { 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */ + { 0x2039, 0x2039, LBP_OP }, /* Left single angle quotation mark: opening */ + { 0x203A, 0x203A, LBP_CL }, /* Right single angle quotation mark: closing */ + { 0, 0, LBP_Undefined } +}; + +/** + * French-specific data over the default Unicode rules. + */ +static struct LineBreakProperties lb_prop_French[] = { + { 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */ + { 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */ + { 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */ + { 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */ + { 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */ + { 0x2039, 0x2039, LBP_OP }, /* Left single angle quotation mark: opening */ + { 0x203A, 0x203A, LBP_CL }, /* Right single angle quotation mark: closing */ + { 0, 0, LBP_Undefined } +}; + +/** + * Russian-specific data over the default Unicode rules. + */ +static struct LineBreakProperties lb_prop_Russian[] = { + { 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */ + { 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */ + { 0x201C, 0x201C, LBP_CL }, /* Left double quotation mark: closing */ + { 0, 0, LBP_Undefined } +}; + +/** + * Chinese-specific data over the default Unicode rules.
+ */ +static struct LineBreakProperties lb_prop_Chinese[] = { + { 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */ + { 0x2019, 0x2019, LBP_CL }, /* Right single quotation mark: closing */ + { 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */ + { 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */ + { 0, 0, LBP_Undefined } +}; + +/** + * Association data of language-specific line breaking properties with + * language names. This is the definition for the static data in this + * file. If you want more flexibility, or do not need the data here, + * you may want to redefine \e lb_prop_lang_map in your C source file. + */ +struct LineBreakPropertiesLang lb_prop_lang_map[] = { + { "en", 2, lb_prop_English }, + { "de", 2, lb_prop_German }, + { "es", 2, lb_prop_Spanish }, + { "fr", 2, lb_prop_French }, + { "ru", 2, lb_prop_Russian }, + { "zh", 2, lb_prop_Chinese }, + { NULL, 0, NULL } +}; diff --git a/linebreak/linebreak/linebreakdef.h b/linebreak/linebreak/linebreakdef.h new file mode 100644 index 0000000..8eb2f51 --- /dev/null +++ b/linebreak/linebreak/linebreakdef.h @@ -0,0 +1,149 @@ +/* vim: set tabstop=4 shiftwidth=4: */ + +/* + * Line breaking in a Unicode sequence. Designed to be used in a + * generic text renderer. + * + * Copyright (C) 2008-2011 Wu Yongwei + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute + * it freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must + * not claim that you wrote the original software. If you use this + * software in a product, an acknowledgement in the product + * documentation would be appreciated but is not required. + * 2. 
Altered source versions must be plainly marked as such, and must + * not be misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source + * distribution. + * + * The main reference is Unicode Standard Annex 14 (UAX #14): + * + * + * When this library was designed, this annex was at Revision 19, for + * Unicode 5.0.0: + * + * + * This library has been updated according to Revision 26, for + * Unicode 6.0.0: + * + * + * The Unicode Terms of Use are available at + * + */ + +/** + * @file linebreakdef.h + * + * Definitions of internal data structures, declarations of global + * variables, and function prototypes for the line breaking algorithm. + * + * @version 2.1, 2011/05/07 + * @author Wu Yongwei + */ + +/** + * Constant value to mark the end of string. It is not a valid Unicode + * character. + */ +#define EOS 0xFFFF + +/** + * Line break classes. This is a direct mapping of Table 1 of Unicode + * Standard Annex 14, Revision 26. + */ +enum LineBreakClass +{ + /* This is used to signal an error condition. */ + LBP_Undefined, /**< Undefined */ + + /* The following break classes are treated in the pair table. 
*/ + LBP_OP, /**< Opening punctuation */ + LBP_CL, /**< Closing punctuation */ + LBP_CP, /**< Closing parenthesis */ + LBP_QU, /**< Ambiguous quotation */ + LBP_GL, /**< Glue */ + LBP_NS, /**< Non-starters */ + LBP_EX, /**< Exclamation/Interrogation */ + LBP_SY, /**< Symbols allowing break after */ + LBP_IS, /**< Infix separator */ + LBP_PR, /**< Prefix */ + LBP_PO, /**< Postfix */ + LBP_NU, /**< Numeric */ + LBP_AL, /**< Alphabetic */ + LBP_ID, /**< Ideographic */ + LBP_IN, /**< Inseparable characters */ + LBP_HY, /**< Hyphen */ + LBP_BA, /**< Break after */ + LBP_BB, /**< Break before */ + LBP_B2, /**< Break on either side (but not pair) */ + LBP_ZW, /**< Zero-width space */ + LBP_CM, /**< Combining marks */ + LBP_WJ, /**< Word joiner */ + LBP_H2, /**< Hangul LV */ + LBP_H3, /**< Hangul LVT */ + LBP_JL, /**< Hangul L Jamo */ + LBP_JV, /**< Hangul V Jamo */ + LBP_JT, /**< Hangul T Jamo */ + + /* The following break classes are not treated in the pair table */ + LBP_AI, /**< Ambiguous (alphabetic or ideograph) */ + LBP_BK, /**< Break (mandatory) */ + LBP_CB, /**< Contingent break */ + LBP_CR, /**< Carriage return */ + LBP_LF, /**< Line feed */ + LBP_NL, /**< Next line */ + LBP_SA, /**< South-East Asian */ + LBP_SG, /**< Surrogates */ + LBP_SP, /**< Space */ + LBP_XX /**< Unknown */ +}; + +/** + * Struct for entries of line break properties. The array of the + * entries \e must be sorted. + */ +struct LineBreakProperties +{ + utf32_t start; /**< Starting coding point */ + utf32_t end; /**< End coding point */ + enum LineBreakClass prop; /**< The line breaking property */ +}; + +/** + * Struct for association of language-specific line breaking properties + * with language names. 
+ */ +struct LineBreakPropertiesLang +{ + const char *lang; /**< Language name */ + size_t namelen; /**< Length of name to match */ + struct LineBreakProperties *lbp; /**< Pointer to associated data */ +}; + +/** + * Abstract function interface for #lb_get_next_char_utf8, + * #lb_get_next_char_utf16, and #lb_get_next_char_utf32. + */ +typedef utf32_t (*get_next_char_t)(const void *, size_t, size_t *); + +/* Declarations */ +extern struct LineBreakProperties lb_prop_default[]; +extern struct LineBreakPropertiesLang lb_prop_lang_map[]; + +/* Function Prototype */ +utf32_t lb_get_next_char_utf8(const utf8_t *s, size_t len, size_t *ip); +utf32_t lb_get_next_char_utf16(const utf16_t *s, size_t len, size_t *ip); +utf32_t lb_get_next_char_utf32(const utf32_t *s, size_t len, size_t *ip); +void set_linebreaks( + const void *s, + size_t len, + const char *lang, + char *brks, + get_next_char_t get_next_char); diff --git a/linebreak/linebreak/purge b/linebreak/linebreak/purge new file mode 100755 index 0000000..c243c2b --- /dev/null +++ b/linebreak/linebreak/purge @@ -0,0 +1,2 @@ +#! /bin/sh +rm -rf Makefile.in aclocal.m4 autom4te.cache/ config.guess config.h.in config.sub configure depcomp doc/ install-sh ltmain.sh missing diff --git a/linebreak/linebreak/sort_numeric_hex.py b/linebreak/linebreak/sort_numeric_hex.py new file mode 100755 index 0000000..cdbd2b5 --- /dev/null +++ b/linebreak/linebreak/sort_numeric_hex.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python +import sys + +lines = open(sys.argv[1]).readlines() +lines_out = sorted(lines, key=lambda line: int(line.split("..")[0], 16)) +sys.stdout.writelines(lines_out) # explicit write: a bare map() is lazy on Python 3 and emits nothing diff --git a/linebreak/linebreak/wordbreak.c b/linebreak/linebreak/wordbreak.c new file mode 100644 index 0000000..60db99e --- /dev/null +++ b/linebreak/linebreak/wordbreak.c @@ -0,0 +1,437 @@ +/* vim: set tabstop=4 shiftwidth=4: */ + +/* + * Word breaking in a Unicode sequence. Designed to be used in a + * generic text renderer.
+ * + * Copyright (C) 2012 Tom Hacohen + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute + * it freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must + * not claim that you wrote the original software. If you use this + * software in a product, an acknowledgement in the product + * documentation would be appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must + * not be misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source + * distribution. + * + * The main reference is Unicode Standard Annex 29 (UAX #29): + * <URL:http://www.unicode.org/reports/tr29/> + * + * When this library was designed, this annex was at Revision 17, for + * Unicode 6.0.0: + * <URL:http://www.unicode.org/reports/tr29/tr29-17.html> + * + * The Unicode Terms of Use are available at + * <URL:http://www.unicode.org/copyright.html> + */ + +/** + * @file wordbreak.c + * + * Implementation of the word breaking algorithm as described in Unicode + * Standard Annex 29. + * + * @version 2.2, 2012/02/04 + * @author Tom Hacohen + */ + +#include <assert.h> +#include <stddef.h> +#include <string.h> +#include "linebreak.h" +#include "linebreakdef.h" + +#include "wordbreak.h" +#include "wordbreakdata.c" + +#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0])) + +/** + * Initializes the wordbreak internals. It currently does nothing, but + * it may in the future. + */ +void init_wordbreak(void) +{ +} + +/** + * Gets the word breaking class of a character.
+ * * @param ch character to check + * @param wbp pointer to the sorted wbp breaking properties array + * @param len size of the wbp array in number of items + * @return the word breaking class if found; \c WBP_Any otherwise + */ +static enum WordBreakClass get_char_wb_class( + utf32_t ch, + struct WordBreakProperties *wbp, + size_t len) +{ + /* Binary search over the half-open window [lo, hi); an empty table + * (len == 0) is never dereferenced. */ + size_t lo = 0; + size_t hi = len; + + while (lo < hi) + { + size_t mid = lo + (hi - lo) / 2; + + if (ch < wbp[mid].start) + hi = mid; + else if (ch > wbp[mid].end) + lo = mid + 1; + else + return wbp[mid].prop; + } + + return WBP_Any; +} + +/** + * Sets the word break types to a specific value in a range. + * + * It sets the inside chars to #WORDBREAK_INSIDEACHAR and the rest to brkType. + * Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are + * cells that we really don't want to break after. + * + * @param[in] s input string + * @param[out] brks breaks array to fill + * @param[in] posStart start position + * @param[in] posEnd end position (exclusive) + * @param[in] len length of the string + * @param[in] brkType breaks type to use + * @param[in] get_next_char function to get the next UTF-32 character + */ +static void set_brks_to( + const void *s, + char *brks, + size_t posStart, + size_t posEnd, + size_t len, + char brkType, + get_next_char_t get_next_char) +{ + size_t posNext = posStart; + while (posNext < posEnd) + { + utf32_t ch; + ch = get_next_char(s, len, &posNext); + assert(ch != EOS); + for (; posStart < posNext - 1; ++posStart) + brks[posStart] = WORDBREAK_INSIDEACHAR; + assert(posStart == posNext - 1); + + /* Only set it if we haven't set it not to break before. */ + if (brks[posStart] != WORDBREAK_NOBREAK) + brks[posStart] = brkType; + posStart = posNext; + } +} + +/* Checks to see if the class is newline, CR, or LF (rules WB3a and b).
*/ +#define IS_WB3ab(cls) (((cls) == WBP_Newline) || ((cls) == WBP_CR) || \ + ((cls) == WBP_LF)) + +/** + * Sets the word breaking information for a generic input string. + * + * @param[in] s input string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or + * #WORDBREAK_INSIDEACHAR + * @param[in] get_next_char function to get the next UTF-32 character + */ +static void set_wordbreaks( + const void *s, + size_t len, + const char *lang, + char *brks, + get_next_char_t get_next_char) +{ + enum WordBreakClass wbcLast = WBP_Undefined; + /* wbcSeqStart is the class that started the current sequence. + * WBP_Undefined is a special case that means "sot". + * This value is the class that is at the start of the current rule + * matching sequence. For example, in case of Numeric+MidNum+Numeric + * it'll be Numeric all the way. + */ + enum WordBreakClass wbcSeqStart = WBP_Undefined; + utf32_t ch; + size_t posNext = 0; + size_t posCur = 0; + size_t posLast = 0; + + /* TODO: Language-specific specialization. */ + (void) lang; + + /* Init brks.
*/ + memset(brks, WORDBREAK_BREAK, len); + + ch = get_next_char(s, len, &posNext); + + while (ch != EOS) + { + enum WordBreakClass wbcCur; + wbcCur = get_char_wb_class(ch, wb_prop_default, + ARRAY_LEN(wb_prop_default)); + + switch (wbcCur) + { + case WBP_CR: + /* WB3b */ + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + wbcSeqStart = wbcCur; + posLast = posCur; + break; + + case WBP_LF: + if (wbcSeqStart == WBP_CR) /* WB3 */ + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_NOBREAK, get_next_char); + wbcSeqStart = wbcCur; + posLast = posCur; + break; + } + /* Fall through */ + + case WBP_Newline: + /* WB3a,3b */ + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + wbcSeqStart = wbcCur; + posLast = posCur; + break; + + case WBP_Extend: + case WBP_Format: + /* WB4 - If not the first char/after a newline (WB3a,3b), skip + * this class, set it to be the same as the prev, and mark + * brks not to break before them. */ + if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart)) + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + wbcSeqStart = wbcCur; + } + else + { + /* It's surely not the first */ + brks[posCur - 1] = WORDBREAK_NOBREAK; + /* "inherit" the previous class.
*/ + wbcCur = wbcLast; + } + break; + + case WBP_Katakana: + if ((wbcSeqStart == WBP_Katakana) || /* WB13 */ + (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */ + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_NOBREAK, get_next_char); + } + /* No rule found, reset */ + else + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + } + wbcSeqStart = wbcCur; + posLast = posCur; + break; + + case WBP_ALetter: + if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */ + (wbcLast == WBP_Numeric) || /* WB10 */ + (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */ + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_NOBREAK, get_next_char); + } + /* No rule found, reset */ + else + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + } + wbcSeqStart = wbcCur; + posLast = posCur; + break; + + case WBP_MidNumLet: + if ((wbcLast == WBP_ALetter) || /* WB6,7 */ + (wbcLast == WBP_Numeric)) /* WB11,12 */ + { + /* Go on */ + } + else + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + wbcSeqStart = wbcCur; + posLast = posCur; + } + break; + + case WBP_MidLetter: + if (wbcLast == WBP_ALetter) /* WB6,7 */ + { + /* Go on */ + } + else + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + wbcSeqStart = wbcCur; + posLast = posCur; + } + break; + + case WBP_MidNum: + if (wbcLast == WBP_Numeric) /* WB11,12 */ + { + /* Go on */ + } + else + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + wbcSeqStart = wbcCur; + posLast = posCur; + } + break; + + case WBP_Numeric: + if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */ + (wbcLast == WBP_ALetter) || /* WB9 */ + (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */ + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_NOBREAK, get_next_char); + } + /* No rule found, reset */ + else + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + } +
wbcSeqStart = wbcCur; + posLast = posCur; + break; + + case WBP_ExtendNumLet: + /* WB13a,13b */ + if ((wbcSeqStart == wbcLast) && + ((wbcLast == WBP_ALetter) || + (wbcLast == WBP_Numeric) || + (wbcLast == WBP_Katakana) || + (wbcLast == WBP_ExtendNumLet))) + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_NOBREAK, get_next_char); + } + /* No rule found, reset */ + else + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + } + wbcSeqStart = wbcCur; + posLast = posCur; + break; + + case WBP_Any: + /* Allow breaks and reset */ + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + wbcSeqStart = wbcCur; + posLast = posCur; + break; + + default: + /* Error, should never get here! */ + assert(0); + break; + } + + wbcLast = wbcCur; + posCur = posNext; + ch = get_next_char(s, len, &posNext); + } + + /* WB2 */ + set_brks_to(s, brks, posLast, posNext, len, + WORDBREAK_BREAK, get_next_char); +} + +/** + * Sets the word breaking information for a UTF-8 input string. + * + * @param[in] s input UTF-8 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or + * #WORDBREAK_INSIDEACHAR + */ +void set_wordbreaks_utf8( + const utf8_t *s, + size_t len, + const char *lang, + char *brks) +{ + set_wordbreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf8); +} + +/** + * Sets the word breaking information for a UTF-16 input string.
+ * + * @param[in] s input UTF-16 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or + * #WORDBREAK_INSIDEACHAR + */ +void set_wordbreaks_utf16( + const utf16_t *s, + size_t len, + const char *lang, + char *brks) +{ + set_wordbreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf16); +} + +/** + * Sets the word breaking information for a UTF-32 input string. + * + * @param[in] s input UTF-32 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or + * #WORDBREAK_INSIDEACHAR + */ +void set_wordbreaks_utf32( + const utf32_t *s, + size_t len, + const char *lang, + char *brks) +{ + set_wordbreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf32); +} diff --git a/linebreak/linebreak/wordbreak.h b/linebreak/linebreak/wordbreak.h new file mode 100644 index 0000000..47bef27 --- /dev/null +++ b/linebreak/linebreak/wordbreak.h @@ -0,0 +1,72 @@ +/* vim: set tabstop=4 shiftwidth=4: */ + +/* + * Word breaking in a Unicode sequence. Designed to be used in a + * generic text renderer. + * + * Copyright (C) 2012 Tom Hacohen + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute + * it freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must + * not claim that you wrote the original software. 
If you use this + * software in a product, an acknowledgement in the product + * documentation would be appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must + * not be misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source + * distribution. + * + * The main reference is Unicode Standard Annex 29 (UAX #29): + * <URL:http://www.unicode.org/reports/tr29/> + * + * When this library was designed, this annex was at Revision 17, for + * Unicode 6.0.0: + * <URL:http://www.unicode.org/reports/tr29/tr29-17.html> + * + * The Unicode Terms of Use are available at + * <URL:http://www.unicode.org/copyright.html> + */ + +/** + * @file wordbreak.h + * + * Header file for the word breaking (segmentation) algorithm. + * + * @version 2.2, 2012/02/04 + * @author Tom Hacohen + */ + +#ifndef WORDBREAK_H +#define WORDBREAK_H + +#include <stddef.h> +#include "linebreak.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define WORDBREAK_BREAK 0 /**< Break is allowed */ +#define WORDBREAK_NOBREAK 1 /**< No break is allowed */ +#define WORDBREAK_INSIDEACHAR 2 /**< A UTF-8/16 sequence is unfinished */ + +void init_wordbreak(void); +void set_wordbreaks_utf8( + const utf8_t *s, size_t len, const char* lang, char *brks); +void set_wordbreaks_utf16( + const utf16_t *s, size_t len, const char* lang, char *brks); +void set_wordbreaks_utf32( + const utf32_t *s, size_t len, const char* lang, char *brks); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/linebreak/linebreak/wordbreakdata.c b/linebreak/linebreak/wordbreakdata.c new file mode 100644 index 0000000..c42c51b --- /dev/null +++ b/linebreak/linebreak/wordbreakdata.c @@ -0,0 +1,860 @@ +/* The content of this file is generated from: +# WordBreakProperty-6.0.0.txt +# Date: 2010-08-19, 00:48:48 GMT [MD] +*/ + +#include "linebreak.h" +#include "wordbreakdef.h" + +static struct WordBreakProperties wb_prop_default[] = { + {0x000A, 0x000A, WBP_LF}, + {0x000B, 0x000C, WBP_Newline}, + {0x000D, 0x000D, WBP_CR}, + {0x0027, 0x0027, WBP_MidNumLet}, + {0x002C, 0x002C, WBP_MidNum}, + {0x002E,
0x002E, WBP_MidNumLet}, + {0x0030, 0x0039, WBP_Numeric}, + {0x003A, 0x003A, WBP_MidLetter}, + {0x003B, 0x003B, WBP_MidNum}, + {0x0041, 0x005A, WBP_ALetter}, + {0x005F, 0x005F, WBP_ExtendNumLet}, + {0x0061, 0x007A, WBP_ALetter}, + {0x0085, 0x0085, WBP_Newline}, + {0x00AA, 0x00AA, WBP_ALetter}, + {0x00AD, 0x00AD, WBP_Format}, + {0x00B5, 0x00B5, WBP_ALetter}, + {0x00B7, 0x00B7, WBP_MidLetter}, + {0x00BA, 0x00BA, WBP_ALetter}, + {0x00C0, 0x00D6, WBP_ALetter}, + {0x00D8, 0x00F6, WBP_ALetter}, + {0x00F8, 0x01BA, WBP_ALetter}, + {0x01BB, 0x01BB, WBP_ALetter}, + {0x01BC, 0x01BF, WBP_ALetter}, + {0x01C0, 0x01C3, WBP_ALetter}, + {0x01C4, 0x0293, WBP_ALetter}, + {0x0294, 0x0294, WBP_ALetter}, + {0x0295, 0x02AF, WBP_ALetter}, + {0x02B0, 0x02C1, WBP_ALetter}, + {0x02C6, 0x02D1, WBP_ALetter}, + {0x02E0, 0x02E4, WBP_ALetter}, + {0x02EC, 0x02EC, WBP_ALetter}, + {0x02EE, 0x02EE, WBP_ALetter}, + {0x0300, 0x036F, WBP_Extend}, + {0x0370, 0x0373, WBP_ALetter}, + {0x0374, 0x0374, WBP_ALetter}, + {0x0376, 0x0377, WBP_ALetter}, + {0x037A, 0x037A, WBP_ALetter}, + {0x037B, 0x037D, WBP_ALetter}, + {0x037E, 0x037E, WBP_MidNum}, + {0x0386, 0x0386, WBP_ALetter}, + {0x0387, 0x0387, WBP_MidLetter}, + {0x0388, 0x038A, WBP_ALetter}, + {0x038C, 0x038C, WBP_ALetter}, + {0x038E, 0x03A1, WBP_ALetter}, + {0x03A3, 0x03F5, WBP_ALetter}, + {0x03F7, 0x0481, WBP_ALetter}, + {0x0483, 0x0487, WBP_Extend}, + {0x0488, 0x0489, WBP_Extend}, + {0x048A, 0x0527, WBP_ALetter}, + {0x0531, 0x0556, WBP_ALetter}, + {0x0559, 0x0559, WBP_ALetter}, + {0x0561, 0x0587, WBP_ALetter}, + {0x0589, 0x0589, WBP_MidNum}, + {0x0591, 0x05BD, WBP_Extend}, + {0x05BF, 0x05BF, WBP_Extend}, + {0x05C1, 0x05C2, WBP_Extend}, + {0x05C4, 0x05C5, WBP_Extend}, + {0x05C7, 0x05C7, WBP_Extend}, + {0x05D0, 0x05EA, WBP_ALetter}, + {0x05F0, 0x05F2, WBP_ALetter}, + {0x05F3, 0x05F3, WBP_ALetter}, + {0x05F4, 0x05F4, WBP_MidLetter}, + {0x0600, 0x0603, WBP_Format}, + {0x060C, 0x060D, WBP_MidNum}, + {0x0610, 0x061A, WBP_Extend}, + {0x0620, 0x063F, 
WBP_ALetter}, + {0x0640, 0x0640, WBP_ALetter}, + {0x0641, 0x064A, WBP_ALetter}, + {0x064B, 0x065F, WBP_Extend}, + {0x0660, 0x0669, WBP_Numeric}, + {0x066B, 0x066B, WBP_Numeric}, + {0x066C, 0x066C, WBP_MidNum}, + {0x066E, 0x066F, WBP_ALetter}, + {0x0670, 0x0670, WBP_Extend}, + {0x0671, 0x06D3, WBP_ALetter}, + {0x06D5, 0x06D5, WBP_ALetter}, + {0x06D6, 0x06DC, WBP_Extend}, + {0x06DD, 0x06DD, WBP_Format}, + {0x06DF, 0x06E4, WBP_Extend}, + {0x06E5, 0x06E6, WBP_ALetter}, + {0x06E7, 0x06E8, WBP_Extend}, + {0x06EA, 0x06ED, WBP_Extend}, + {0x06EE, 0x06EF, WBP_ALetter}, + {0x06F0, 0x06F9, WBP_Numeric}, + {0x06FA, 0x06FC, WBP_ALetter}, + {0x06FF, 0x06FF, WBP_ALetter}, + {0x070F, 0x070F, WBP_Format}, + {0x0710, 0x0710, WBP_ALetter}, + {0x0711, 0x0711, WBP_Extend}, + {0x0712, 0x072F, WBP_ALetter}, + {0x0730, 0x074A, WBP_Extend}, + {0x074D, 0x07A5, WBP_ALetter}, + {0x07A6, 0x07B0, WBP_Extend}, + {0x07B1, 0x07B1, WBP_ALetter}, + {0x07C0, 0x07C9, WBP_Numeric}, + {0x07CA, 0x07EA, WBP_ALetter}, + {0x07EB, 0x07F3, WBP_Extend}, + {0x07F4, 0x07F5, WBP_ALetter}, + {0x07F8, 0x07F8, WBP_MidNum}, + {0x07FA, 0x07FA, WBP_ALetter}, + {0x0800, 0x0815, WBP_ALetter}, + {0x0816, 0x0819, WBP_Extend}, + {0x081A, 0x081A, WBP_ALetter}, + {0x081B, 0x0823, WBP_Extend}, + {0x0824, 0x0824, WBP_ALetter}, + {0x0825, 0x0827, WBP_Extend}, + {0x0828, 0x0828, WBP_ALetter}, + {0x0829, 0x082D, WBP_Extend}, + {0x0840, 0x0858, WBP_ALetter}, + {0x0859, 0x085B, WBP_Extend}, + {0x0900, 0x0902, WBP_Extend}, + {0x0903, 0x0903, WBP_Extend}, + {0x0904, 0x0939, WBP_ALetter}, + {0x093A, 0x093A, WBP_Extend}, + {0x093B, 0x093B, WBP_Extend}, + {0x093C, 0x093C, WBP_Extend}, + {0x093D, 0x093D, WBP_ALetter}, + {0x093E, 0x0940, WBP_Extend}, + {0x0941, 0x0948, WBP_Extend}, + {0x0949, 0x094C, WBP_Extend}, + {0x094D, 0x094D, WBP_Extend}, + {0x094E, 0x094F, WBP_Extend}, + {0x0950, 0x0950, WBP_ALetter}, + {0x0951, 0x0957, WBP_Extend}, + {0x0958, 0x0961, WBP_ALetter}, + {0x0962, 0x0963, WBP_Extend}, + {0x0966, 0x096F, WBP_Numeric}, + 
{0x0971, 0x0971, WBP_ALetter}, + {0x0972, 0x0977, WBP_ALetter}, + {0x0979, 0x097F, WBP_ALetter}, + {0x0981, 0x0981, WBP_Extend}, + {0x0982, 0x0983, WBP_Extend}, + {0x0985, 0x098C, WBP_ALetter}, + {0x098F, 0x0990, WBP_ALetter}, + {0x0993, 0x09A8, WBP_ALetter}, + {0x09AA, 0x09B0, WBP_ALetter}, + {0x09B2, 0x09B2, WBP_ALetter}, + {0x09B6, 0x09B9, WBP_ALetter}, + {0x09BC, 0x09BC, WBP_Extend}, + {0x09BD, 0x09BD, WBP_ALetter}, + {0x09BE, 0x09C0, WBP_Extend}, + {0x09C1, 0x09C4, WBP_Extend}, + {0x09C7, 0x09C8, WBP_Extend}, + {0x09CB, 0x09CC, WBP_Extend}, + {0x09CD, 0x09CD, WBP_Extend}, + {0x09CE, 0x09CE, WBP_ALetter}, + {0x09D7, 0x09D7, WBP_Extend}, + {0x09DC, 0x09DD, WBP_ALetter}, + {0x09DF, 0x09E1, WBP_ALetter}, + {0x09E2, 0x09E3, WBP_Extend}, + {0x09E6, 0x09EF, WBP_Numeric}, + {0x09F0, 0x09F1, WBP_ALetter}, + {0x0A01, 0x0A02, WBP_Extend}, + {0x0A03, 0x0A03, WBP_Extend}, + {0x0A05, 0x0A0A, WBP_ALetter}, + {0x0A0F, 0x0A10, WBP_ALetter}, + {0x0A13, 0x0A28, WBP_ALetter}, + {0x0A2A, 0x0A30, WBP_ALetter}, + {0x0A32, 0x0A33, WBP_ALetter}, + {0x0A35, 0x0A36, WBP_ALetter}, + {0x0A38, 0x0A39, WBP_ALetter}, + {0x0A3C, 0x0A3C, WBP_Extend}, + {0x0A3E, 0x0A40, WBP_Extend}, + {0x0A41, 0x0A42, WBP_Extend}, + {0x0A47, 0x0A48, WBP_Extend}, + {0x0A4B, 0x0A4D, WBP_Extend}, + {0x0A51, 0x0A51, WBP_Extend}, + {0x0A59, 0x0A5C, WBP_ALetter}, + {0x0A5E, 0x0A5E, WBP_ALetter}, + {0x0A66, 0x0A6F, WBP_Numeric}, + {0x0A70, 0x0A71, WBP_Extend}, + {0x0A72, 0x0A74, WBP_ALetter}, + {0x0A75, 0x0A75, WBP_Extend}, + {0x0A81, 0x0A82, WBP_Extend}, + {0x0A83, 0x0A83, WBP_Extend}, + {0x0A85, 0x0A8D, WBP_ALetter}, + {0x0A8F, 0x0A91, WBP_ALetter}, + {0x0A93, 0x0AA8, WBP_ALetter}, + {0x0AAA, 0x0AB0, WBP_ALetter}, + {0x0AB2, 0x0AB3, WBP_ALetter}, + {0x0AB5, 0x0AB9, WBP_ALetter}, + {0x0ABC, 0x0ABC, WBP_Extend}, + {0x0ABD, 0x0ABD, WBP_ALetter}, + {0x0ABE, 0x0AC0, WBP_Extend}, + {0x0AC1, 0x0AC5, WBP_Extend}, + {0x0AC7, 0x0AC8, WBP_Extend}, + {0x0AC9, 0x0AC9, WBP_Extend}, + {0x0ACB, 0x0ACC, WBP_Extend}, + {0x0ACD, 
0x0ACD, WBP_Extend}, + {0x0AD0, 0x0AD0, WBP_ALetter}, + {0x0AE0, 0x0AE1, WBP_ALetter}, + {0x0AE2, 0x0AE3, WBP_Extend}, + {0x0AE6, 0x0AEF, WBP_Numeric}, + {0x0B01, 0x0B01, WBP_Extend}, + {0x0B02, 0x0B03, WBP_Extend}, + {0x0B05, 0x0B0C, WBP_ALetter}, + {0x0B0F, 0x0B10, WBP_ALetter}, + {0x0B13, 0x0B28, WBP_ALetter}, + {0x0B2A, 0x0B30, WBP_ALetter}, + {0x0B32, 0x0B33, WBP_ALetter}, + {0x0B35, 0x0B39, WBP_ALetter}, + {0x0B3C, 0x0B3C, WBP_Extend}, + {0x0B3D, 0x0B3D, WBP_ALetter}, + {0x0B3E, 0x0B3E, WBP_Extend}, + {0x0B3F, 0x0B3F, WBP_Extend}, + {0x0B40, 0x0B40, WBP_Extend}, + {0x0B41, 0x0B44, WBP_Extend}, + {0x0B47, 0x0B48, WBP_Extend}, + {0x0B4B, 0x0B4C, WBP_Extend}, + {0x0B4D, 0x0B4D, WBP_Extend}, + {0x0B56, 0x0B56, WBP_Extend}, + {0x0B57, 0x0B57, WBP_Extend}, + {0x0B5C, 0x0B5D, WBP_ALetter}, + {0x0B5F, 0x0B61, WBP_ALetter}, + {0x0B62, 0x0B63, WBP_Extend}, + {0x0B66, 0x0B6F, WBP_Numeric}, + {0x0B71, 0x0B71, WBP_ALetter}, + {0x0B82, 0x0B82, WBP_Extend}, + {0x0B83, 0x0B83, WBP_ALetter}, + {0x0B85, 0x0B8A, WBP_ALetter}, + {0x0B8E, 0x0B90, WBP_ALetter}, + {0x0B92, 0x0B95, WBP_ALetter}, + {0x0B99, 0x0B9A, WBP_ALetter}, + {0x0B9C, 0x0B9C, WBP_ALetter}, + {0x0B9E, 0x0B9F, WBP_ALetter}, + {0x0BA3, 0x0BA4, WBP_ALetter}, + {0x0BA8, 0x0BAA, WBP_ALetter}, + {0x0BAE, 0x0BB9, WBP_ALetter}, + {0x0BBE, 0x0BBF, WBP_Extend}, + {0x0BC0, 0x0BC0, WBP_Extend}, + {0x0BC1, 0x0BC2, WBP_Extend}, + {0x0BC6, 0x0BC8, WBP_Extend}, + {0x0BCA, 0x0BCC, WBP_Extend}, + {0x0BCD, 0x0BCD, WBP_Extend}, + {0x0BD0, 0x0BD0, WBP_ALetter}, + {0x0BD7, 0x0BD7, WBP_Extend}, + {0x0BE6, 0x0BEF, WBP_Numeric}, + {0x0C01, 0x0C03, WBP_Extend}, + {0x0C05, 0x0C0C, WBP_ALetter}, + {0x0C0E, 0x0C10, WBP_ALetter}, + {0x0C12, 0x0C28, WBP_ALetter}, + {0x0C2A, 0x0C33, WBP_ALetter}, + {0x0C35, 0x0C39, WBP_ALetter}, + {0x0C3D, 0x0C3D, WBP_ALetter}, + {0x0C3E, 0x0C40, WBP_Extend}, + {0x0C41, 0x0C44, WBP_Extend}, + {0x0C46, 0x0C48, WBP_Extend}, + {0x0C4A, 0x0C4D, WBP_Extend}, + {0x0C55, 0x0C56, WBP_Extend}, + {0x0C58, 0x0C59, 
WBP_ALetter}, + {0x0C60, 0x0C61, WBP_ALetter}, + {0x0C62, 0x0C63, WBP_Extend}, + {0x0C66, 0x0C6F, WBP_Numeric}, + {0x0C82, 0x0C83, WBP_Extend}, + {0x0C85, 0x0C8C, WBP_ALetter}, + {0x0C8E, 0x0C90, WBP_ALetter}, + {0x0C92, 0x0CA8, WBP_ALetter}, + {0x0CAA, 0x0CB3, WBP_ALetter}, + {0x0CB5, 0x0CB9, WBP_ALetter}, + {0x0CBC, 0x0CBC, WBP_Extend}, + {0x0CBD, 0x0CBD, WBP_ALetter}, + {0x0CBE, 0x0CBE, WBP_Extend}, + {0x0CBF, 0x0CBF, WBP_Extend}, + {0x0CC0, 0x0CC4, WBP_Extend}, + {0x0CC6, 0x0CC6, WBP_Extend}, + {0x0CC7, 0x0CC8, WBP_Extend}, + {0x0CCA, 0x0CCB, WBP_Extend}, + {0x0CCC, 0x0CCD, WBP_Extend}, + {0x0CD5, 0x0CD6, WBP_Extend}, + {0x0CDE, 0x0CDE, WBP_ALetter}, + {0x0CE0, 0x0CE1, WBP_ALetter}, + {0x0CE2, 0x0CE3, WBP_Extend}, + {0x0CE6, 0x0CEF, WBP_Numeric}, + {0x0CF1, 0x0CF2, WBP_ALetter}, + {0x0D02, 0x0D03, WBP_Extend}, + {0x0D05, 0x0D0C, WBP_ALetter}, + {0x0D0E, 0x0D10, WBP_ALetter}, + {0x0D12, 0x0D3A, WBP_ALetter}, + {0x0D3D, 0x0D3D, WBP_ALetter}, + {0x0D3E, 0x0D40, WBP_Extend}, + {0x0D41, 0x0D44, WBP_Extend}, + {0x0D46, 0x0D48, WBP_Extend}, + {0x0D4A, 0x0D4C, WBP_Extend}, + {0x0D4D, 0x0D4D, WBP_Extend}, + {0x0D4E, 0x0D4E, WBP_ALetter}, + {0x0D57, 0x0D57, WBP_Extend}, + {0x0D60, 0x0D61, WBP_ALetter}, + {0x0D62, 0x0D63, WBP_Extend}, + {0x0D66, 0x0D6F, WBP_Numeric}, + {0x0D7A, 0x0D7F, WBP_ALetter}, + {0x0D82, 0x0D83, WBP_Extend}, + {0x0D85, 0x0D96, WBP_ALetter}, + {0x0D9A, 0x0DB1, WBP_ALetter}, + {0x0DB3, 0x0DBB, WBP_ALetter}, + {0x0DBD, 0x0DBD, WBP_ALetter}, + {0x0DC0, 0x0DC6, WBP_ALetter}, + {0x0DCA, 0x0DCA, WBP_Extend}, + {0x0DCF, 0x0DD1, WBP_Extend}, + {0x0DD2, 0x0DD4, WBP_Extend}, + {0x0DD6, 0x0DD6, WBP_Extend}, + {0x0DD8, 0x0DDF, WBP_Extend}, + {0x0DF2, 0x0DF3, WBP_Extend}, + {0x0E31, 0x0E31, WBP_Extend}, + {0x0E34, 0x0E3A, WBP_Extend}, + {0x0E47, 0x0E4E, WBP_Extend}, + {0x0E50, 0x0E59, WBP_Numeric}, + {0x0EB1, 0x0EB1, WBP_Extend}, + {0x0EB4, 0x0EB9, WBP_Extend}, + {0x0EBB, 0x0EBC, WBP_Extend}, + {0x0EC8, 0x0ECD, WBP_Extend}, + {0x0ED0, 0x0ED9, WBP_Numeric}, + 
{0x0F00, 0x0F00, WBP_ALetter}, + {0x0F18, 0x0F19, WBP_Extend}, + {0x0F20, 0x0F29, WBP_Numeric}, + {0x0F35, 0x0F35, WBP_Extend}, + {0x0F37, 0x0F37, WBP_Extend}, + {0x0F39, 0x0F39, WBP_Extend}, + {0x0F3E, 0x0F3F, WBP_Extend}, + {0x0F40, 0x0F47, WBP_ALetter}, + {0x0F49, 0x0F6C, WBP_ALetter}, + {0x0F71, 0x0F7E, WBP_Extend}, + {0x0F7F, 0x0F7F, WBP_Extend}, + {0x0F80, 0x0F84, WBP_Extend}, + {0x0F86, 0x0F87, WBP_Extend}, + {0x0F88, 0x0F8C, WBP_ALetter}, + {0x0F8D, 0x0F97, WBP_Extend}, + {0x0F99, 0x0FBC, WBP_Extend}, + {0x0FC6, 0x0FC6, WBP_Extend}, + {0x102B, 0x102C, WBP_Extend}, + {0x102D, 0x1030, WBP_Extend}, + {0x1031, 0x1031, WBP_Extend}, + {0x1032, 0x1037, WBP_Extend}, + {0x1038, 0x1038, WBP_Extend}, + {0x1039, 0x103A, WBP_Extend}, + {0x103B, 0x103C, WBP_Extend}, + {0x103D, 0x103E, WBP_Extend}, + {0x1040, 0x1049, WBP_Numeric}, + {0x1056, 0x1057, WBP_Extend}, + {0x1058, 0x1059, WBP_Extend}, + {0x105E, 0x1060, WBP_Extend}, + {0x1062, 0x1064, WBP_Extend}, + {0x1067, 0x106D, WBP_Extend}, + {0x1071, 0x1074, WBP_Extend}, + {0x1082, 0x1082, WBP_Extend}, + {0x1083, 0x1084, WBP_Extend}, + {0x1085, 0x1086, WBP_Extend}, + {0x1087, 0x108C, WBP_Extend}, + {0x108D, 0x108D, WBP_Extend}, + {0x108F, 0x108F, WBP_Extend}, + {0x1090, 0x1099, WBP_Numeric}, + {0x109A, 0x109C, WBP_Extend}, + {0x109D, 0x109D, WBP_Extend}, + {0x10A0, 0x10C5, WBP_ALetter}, + {0x10D0, 0x10FA, WBP_ALetter}, + {0x10FC, 0x10FC, WBP_ALetter}, + {0x1100, 0x1248, WBP_ALetter}, + {0x124A, 0x124D, WBP_ALetter}, + {0x1250, 0x1256, WBP_ALetter}, + {0x1258, 0x1258, WBP_ALetter}, + {0x125A, 0x125D, WBP_ALetter}, + {0x1260, 0x1288, WBP_ALetter}, + {0x128A, 0x128D, WBP_ALetter}, + {0x1290, 0x12B0, WBP_ALetter}, + {0x12B2, 0x12B5, WBP_ALetter}, + {0x12B8, 0x12BE, WBP_ALetter}, + {0x12C0, 0x12C0, WBP_ALetter}, + {0x12C2, 0x12C5, WBP_ALetter}, + {0x12C8, 0x12D6, WBP_ALetter}, + {0x12D8, 0x1310, WBP_ALetter}, + {0x1312, 0x1315, WBP_ALetter}, + {0x1318, 0x135A, WBP_ALetter}, + {0x135D, 0x135F, WBP_Extend}, + {0x1380, 0x138F, 
WBP_ALetter}, + {0x13A0, 0x13F4, WBP_ALetter}, + {0x1401, 0x166C, WBP_ALetter}, + {0x166F, 0x167F, WBP_ALetter}, + {0x1681, 0x169A, WBP_ALetter}, + {0x16A0, 0x16EA, WBP_ALetter}, + {0x16EE, 0x16F0, WBP_ALetter}, + {0x1700, 0x170C, WBP_ALetter}, + {0x170E, 0x1711, WBP_ALetter}, + {0x1712, 0x1714, WBP_Extend}, + {0x1720, 0x1731, WBP_ALetter}, + {0x1732, 0x1734, WBP_Extend}, + {0x1740, 0x1751, WBP_ALetter}, + {0x1752, 0x1753, WBP_Extend}, + {0x1760, 0x176C, WBP_ALetter}, + {0x176E, 0x1770, WBP_ALetter}, + {0x1772, 0x1773, WBP_Extend}, + {0x17B4, 0x17B5, WBP_Format}, + {0x17B6, 0x17B6, WBP_Extend}, + {0x17B7, 0x17BD, WBP_Extend}, + {0x17BE, 0x17C5, WBP_Extend}, + {0x17C6, 0x17C6, WBP_Extend}, + {0x17C7, 0x17C8, WBP_Extend}, + {0x17C9, 0x17D3, WBP_Extend}, + {0x17DD, 0x17DD, WBP_Extend}, + {0x17E0, 0x17E9, WBP_Numeric}, + {0x180B, 0x180D, WBP_Extend}, + {0x1810, 0x1819, WBP_Numeric}, + {0x1820, 0x1842, WBP_ALetter}, + {0x1843, 0x1843, WBP_ALetter}, + {0x1844, 0x1877, WBP_ALetter}, + {0x1880, 0x18A8, WBP_ALetter}, + {0x18A9, 0x18A9, WBP_Extend}, + {0x18AA, 0x18AA, WBP_ALetter}, + {0x18B0, 0x18F5, WBP_ALetter}, + {0x1900, 0x191C, WBP_ALetter}, + {0x1920, 0x1922, WBP_Extend}, + {0x1923, 0x1926, WBP_Extend}, + {0x1927, 0x1928, WBP_Extend}, + {0x1929, 0x192B, WBP_Extend}, + {0x1930, 0x1931, WBP_Extend}, + {0x1932, 0x1932, WBP_Extend}, + {0x1933, 0x1938, WBP_Extend}, + {0x1939, 0x193B, WBP_Extend}, + {0x1946, 0x194F, WBP_Numeric}, + {0x19B0, 0x19C0, WBP_Extend}, + {0x19C8, 0x19C9, WBP_Extend}, + {0x19D0, 0x19D9, WBP_Numeric}, + {0x1A00, 0x1A16, WBP_ALetter}, + {0x1A17, 0x1A18, WBP_Extend}, + {0x1A19, 0x1A1B, WBP_Extend}, + {0x1A55, 0x1A55, WBP_Extend}, + {0x1A56, 0x1A56, WBP_Extend}, + {0x1A57, 0x1A57, WBP_Extend}, + {0x1A58, 0x1A5E, WBP_Extend}, + {0x1A60, 0x1A60, WBP_Extend}, + {0x1A61, 0x1A61, WBP_Extend}, + {0x1A62, 0x1A62, WBP_Extend}, + {0x1A63, 0x1A64, WBP_Extend}, + {0x1A65, 0x1A6C, WBP_Extend}, + {0x1A6D, 0x1A72, WBP_Extend}, + {0x1A73, 0x1A7C, WBP_Extend}, + 
{0x1A7F, 0x1A7F, WBP_Extend}, + {0x1A80, 0x1A89, WBP_Numeric}, + {0x1A90, 0x1A99, WBP_Numeric}, + {0x1B00, 0x1B03, WBP_Extend}, + {0x1B04, 0x1B04, WBP_Extend}, + {0x1B05, 0x1B33, WBP_ALetter}, + {0x1B34, 0x1B34, WBP_Extend}, + {0x1B35, 0x1B35, WBP_Extend}, + {0x1B36, 0x1B3A, WBP_Extend}, + {0x1B3B, 0x1B3B, WBP_Extend}, + {0x1B3C, 0x1B3C, WBP_Extend}, + {0x1B3D, 0x1B41, WBP_Extend}, + {0x1B42, 0x1B42, WBP_Extend}, + {0x1B43, 0x1B44, WBP_Extend}, + {0x1B45, 0x1B4B, WBP_ALetter}, + {0x1B50, 0x1B59, WBP_Numeric}, + {0x1B6B, 0x1B73, WBP_Extend}, + {0x1B80, 0x1B81, WBP_Extend}, + {0x1B82, 0x1B82, WBP_Extend}, + {0x1B83, 0x1BA0, WBP_ALetter}, + {0x1BA1, 0x1BA1, WBP_Extend}, + {0x1BA2, 0x1BA5, WBP_Extend}, + {0x1BA6, 0x1BA7, WBP_Extend}, + {0x1BA8, 0x1BA9, WBP_Extend}, + {0x1BAA, 0x1BAA, WBP_Extend}, + {0x1BAE, 0x1BAF, WBP_ALetter}, + {0x1BB0, 0x1BB9, WBP_Numeric}, + {0x1BC0, 0x1BE5, WBP_ALetter}, + {0x1BE6, 0x1BE6, WBP_Extend}, + {0x1BE7, 0x1BE7, WBP_Extend}, + {0x1BE8, 0x1BE9, WBP_Extend}, + {0x1BEA, 0x1BEC, WBP_Extend}, + {0x1BED, 0x1BED, WBP_Extend}, + {0x1BEE, 0x1BEE, WBP_Extend}, + {0x1BEF, 0x1BF1, WBP_Extend}, + {0x1BF2, 0x1BF3, WBP_Extend}, + {0x1C00, 0x1C23, WBP_ALetter}, + {0x1C24, 0x1C2B, WBP_Extend}, + {0x1C2C, 0x1C33, WBP_Extend}, + {0x1C34, 0x1C35, WBP_Extend}, + {0x1C36, 0x1C37, WBP_Extend}, + {0x1C40, 0x1C49, WBP_Numeric}, + {0x1C4D, 0x1C4F, WBP_ALetter}, + {0x1C50, 0x1C59, WBP_Numeric}, + {0x1C5A, 0x1C77, WBP_ALetter}, + {0x1C78, 0x1C7D, WBP_ALetter}, + {0x1CD0, 0x1CD2, WBP_Extend}, + {0x1CD4, 0x1CE0, WBP_Extend}, + {0x1CE1, 0x1CE1, WBP_Extend}, + {0x1CE2, 0x1CE8, WBP_Extend}, + {0x1CE9, 0x1CEC, WBP_ALetter}, + {0x1CED, 0x1CED, WBP_Extend}, + {0x1CEE, 0x1CF1, WBP_ALetter}, + {0x1CF2, 0x1CF2, WBP_Extend}, + {0x1D00, 0x1D2B, WBP_ALetter}, + {0x1D2C, 0x1D61, WBP_ALetter}, + {0x1D62, 0x1D77, WBP_ALetter}, + {0x1D78, 0x1D78, WBP_ALetter}, + {0x1D79, 0x1D9A, WBP_ALetter}, + {0x1D9B, 0x1DBF, WBP_ALetter}, + {0x1DC0, 0x1DE6, WBP_Extend}, + {0x1DFC, 0x1DFF, 
WBP_Extend}, + {0x1E00, 0x1F15, WBP_ALetter}, + {0x1F18, 0x1F1D, WBP_ALetter}, + {0x1F20, 0x1F45, WBP_ALetter}, + {0x1F48, 0x1F4D, WBP_ALetter}, + {0x1F50, 0x1F57, WBP_ALetter}, + {0x1F59, 0x1F59, WBP_ALetter}, + {0x1F5B, 0x1F5B, WBP_ALetter}, + {0x1F5D, 0x1F5D, WBP_ALetter}, + {0x1F5F, 0x1F7D, WBP_ALetter}, + {0x1F80, 0x1FB4, WBP_ALetter}, + {0x1FB6, 0x1FBC, WBP_ALetter}, + {0x1FBE, 0x1FBE, WBP_ALetter}, + {0x1FC2, 0x1FC4, WBP_ALetter}, + {0x1FC6, 0x1FCC, WBP_ALetter}, + {0x1FD0, 0x1FD3, WBP_ALetter}, + {0x1FD6, 0x1FDB, WBP_ALetter}, + {0x1FE0, 0x1FEC, WBP_ALetter}, + {0x1FF2, 0x1FF4, WBP_ALetter}, + {0x1FF6, 0x1FFC, WBP_ALetter}, + {0x200C, 0x200D, WBP_Extend}, + {0x200E, 0x200F, WBP_Format}, + {0x2018, 0x2018, WBP_MidNumLet}, + {0x2019, 0x2019, WBP_MidNumLet}, + {0x2024, 0x2024, WBP_MidNumLet}, + {0x2027, 0x2027, WBP_MidLetter}, + {0x2028, 0x2028, WBP_Newline}, + {0x2029, 0x2029, WBP_Newline}, + {0x202A, 0x202E, WBP_Format}, + {0x203F, 0x2040, WBP_ExtendNumLet}, + {0x2044, 0x2044, WBP_MidNum}, + {0x2054, 0x2054, WBP_ExtendNumLet}, + {0x2060, 0x2064, WBP_Format}, + {0x206A, 0x206F, WBP_Format}, + {0x2071, 0x2071, WBP_ALetter}, + {0x207F, 0x207F, WBP_ALetter}, + {0x2090, 0x209C, WBP_ALetter}, + {0x20D0, 0x20DC, WBP_Extend}, + {0x20DD, 0x20E0, WBP_Extend}, + {0x20E1, 0x20E1, WBP_Extend}, + {0x20E2, 0x20E4, WBP_Extend}, + {0x20E5, 0x20F0, WBP_Extend}, + {0x2102, 0x2102, WBP_ALetter}, + {0x2107, 0x2107, WBP_ALetter}, + {0x210A, 0x2113, WBP_ALetter}, + {0x2115, 0x2115, WBP_ALetter}, + {0x2119, 0x211D, WBP_ALetter}, + {0x2124, 0x2124, WBP_ALetter}, + {0x2126, 0x2126, WBP_ALetter}, + {0x2128, 0x2128, WBP_ALetter}, + {0x212A, 0x212D, WBP_ALetter}, + {0x212F, 0x2134, WBP_ALetter}, + {0x2135, 0x2138, WBP_ALetter}, + {0x2139, 0x2139, WBP_ALetter}, + {0x213C, 0x213F, WBP_ALetter}, + {0x2145, 0x2149, WBP_ALetter}, + {0x214E, 0x214E, WBP_ALetter}, + {0x2160, 0x2182, WBP_ALetter}, + {0x2183, 0x2184, WBP_ALetter}, + {0x2185, 0x2188, WBP_ALetter}, + {0x24B6, 0x24E9, WBP_ALetter}, 
+ {0x2C00, 0x2C2E, WBP_ALetter}, + {0x2C30, 0x2C5E, WBP_ALetter}, + {0x2C60, 0x2C7C, WBP_ALetter}, + {0x2C7D, 0x2C7D, WBP_ALetter}, + {0x2C7E, 0x2CE4, WBP_ALetter}, + {0x2CEB, 0x2CEE, WBP_ALetter}, + {0x2CEF, 0x2CF1, WBP_Extend}, + {0x2D00, 0x2D25, WBP_ALetter}, + {0x2D30, 0x2D65, WBP_ALetter}, + {0x2D6F, 0x2D6F, WBP_ALetter}, + {0x2D7F, 0x2D7F, WBP_Extend}, + {0x2D80, 0x2D96, WBP_ALetter}, + {0x2DA0, 0x2DA6, WBP_ALetter}, + {0x2DA8, 0x2DAE, WBP_ALetter}, + {0x2DB0, 0x2DB6, WBP_ALetter}, + {0x2DB8, 0x2DBE, WBP_ALetter}, + {0x2DC0, 0x2DC6, WBP_ALetter}, + {0x2DC8, 0x2DCE, WBP_ALetter}, + {0x2DD0, 0x2DD6, WBP_ALetter}, + {0x2DD8, 0x2DDE, WBP_ALetter}, + {0x2DE0, 0x2DFF, WBP_Extend}, + {0x2E2F, 0x2E2F, WBP_ALetter}, + {0x3005, 0x3005, WBP_ALetter}, + {0x302A, 0x302F, WBP_Extend}, + {0x3031, 0x3035, WBP_Katakana}, + {0x303B, 0x303B, WBP_ALetter}, + {0x303C, 0x303C, WBP_ALetter}, + {0x3099, 0x309A, WBP_Extend}, + {0x309B, 0x309C, WBP_Katakana}, + {0x30A0, 0x30A0, WBP_Katakana}, + {0x30A1, 0x30FA, WBP_Katakana}, + {0x30FC, 0x30FE, WBP_Katakana}, + {0x30FF, 0x30FF, WBP_Katakana}, + {0x3105, 0x312D, WBP_ALetter}, + {0x3131, 0x318E, WBP_ALetter}, + {0x31A0, 0x31BA, WBP_ALetter}, + {0x31F0, 0x31FF, WBP_Katakana}, + {0x32D0, 0x32FE, WBP_Katakana}, + {0x3300, 0x3357, WBP_Katakana}, + {0xA000, 0xA014, WBP_ALetter}, + {0xA015, 0xA015, WBP_ALetter}, + {0xA016, 0xA48C, WBP_ALetter}, + {0xA4D0, 0xA4F7, WBP_ALetter}, + {0xA4F8, 0xA4FD, WBP_ALetter}, + {0xA500, 0xA60B, WBP_ALetter}, + {0xA60C, 0xA60C, WBP_ALetter}, + {0xA610, 0xA61F, WBP_ALetter}, + {0xA620, 0xA629, WBP_Numeric}, + {0xA62A, 0xA62B, WBP_ALetter}, + {0xA640, 0xA66D, WBP_ALetter}, + {0xA66E, 0xA66E, WBP_ALetter}, + {0xA66F, 0xA66F, WBP_Extend}, + {0xA670, 0xA672, WBP_Extend}, + {0xA67C, 0xA67D, WBP_Extend}, + {0xA67F, 0xA67F, WBP_ALetter}, + {0xA680, 0xA697, WBP_ALetter}, + {0xA6A0, 0xA6E5, WBP_ALetter}, + {0xA6E6, 0xA6EF, WBP_ALetter}, + {0xA6F0, 0xA6F1, WBP_Extend}, + {0xA717, 0xA71F, WBP_ALetter}, + {0xA722, 0xA76F, 
WBP_ALetter}, + {0xA770, 0xA770, WBP_ALetter}, + {0xA771, 0xA787, WBP_ALetter}, + {0xA788, 0xA788, WBP_ALetter}, + {0xA78B, 0xA78E, WBP_ALetter}, + {0xA790, 0xA791, WBP_ALetter}, + {0xA7A0, 0xA7A9, WBP_ALetter}, + {0xA7FA, 0xA7FA, WBP_ALetter}, + {0xA7FB, 0xA801, WBP_ALetter}, + {0xA802, 0xA802, WBP_Extend}, + {0xA803, 0xA805, WBP_ALetter}, + {0xA806, 0xA806, WBP_Extend}, + {0xA807, 0xA80A, WBP_ALetter}, + {0xA80B, 0xA80B, WBP_Extend}, + {0xA80C, 0xA822, WBP_ALetter}, + {0xA823, 0xA824, WBP_Extend}, + {0xA825, 0xA826, WBP_Extend}, + {0xA827, 0xA827, WBP_Extend}, + {0xA840, 0xA873, WBP_ALetter}, + {0xA880, 0xA881, WBP_Extend}, + {0xA882, 0xA8B3, WBP_ALetter}, + {0xA8B4, 0xA8C3, WBP_Extend}, + {0xA8C4, 0xA8C4, WBP_Extend}, + {0xA8D0, 0xA8D9, WBP_Numeric}, + {0xA8E0, 0xA8F1, WBP_Extend}, + {0xA8F2, 0xA8F7, WBP_ALetter}, + {0xA8FB, 0xA8FB, WBP_ALetter}, + {0xA900, 0xA909, WBP_Numeric}, + {0xA90A, 0xA925, WBP_ALetter}, + {0xA926, 0xA92D, WBP_Extend}, + {0xA930, 0xA946, WBP_ALetter}, + {0xA947, 0xA951, WBP_Extend}, + {0xA952, 0xA953, WBP_Extend}, + {0xA960, 0xA97C, WBP_ALetter}, + {0xA980, 0xA982, WBP_Extend}, + {0xA983, 0xA983, WBP_Extend}, + {0xA984, 0xA9B2, WBP_ALetter}, + {0xA9B3, 0xA9B3, WBP_Extend}, + {0xA9B4, 0xA9B5, WBP_Extend}, + {0xA9B6, 0xA9B9, WBP_Extend}, + {0xA9BA, 0xA9BB, WBP_Extend}, + {0xA9BC, 0xA9BC, WBP_Extend}, + {0xA9BD, 0xA9C0, WBP_Extend}, + {0xA9CF, 0xA9CF, WBP_ALetter}, + {0xA9D0, 0xA9D9, WBP_Numeric}, + {0xAA00, 0xAA28, WBP_ALetter}, + {0xAA29, 0xAA2E, WBP_Extend}, + {0xAA2F, 0xAA30, WBP_Extend}, + {0xAA31, 0xAA32, WBP_Extend}, + {0xAA33, 0xAA34, WBP_Extend}, + {0xAA35, 0xAA36, WBP_Extend}, + {0xAA40, 0xAA42, WBP_ALetter}, + {0xAA43, 0xAA43, WBP_Extend}, + {0xAA44, 0xAA4B, WBP_ALetter}, + {0xAA4C, 0xAA4C, WBP_Extend}, + {0xAA4D, 0xAA4D, WBP_Extend}, + {0xAA50, 0xAA59, WBP_Numeric}, + {0xAA7B, 0xAA7B, WBP_Extend}, + {0xAAB0, 0xAAB0, WBP_Extend}, + {0xAAB2, 0xAAB4, WBP_Extend}, + {0xAAB7, 0xAAB8, WBP_Extend}, + {0xAABE, 0xAABF, WBP_Extend}, + 
{0xAAC1, 0xAAC1, WBP_Extend}, + {0xAB01, 0xAB06, WBP_ALetter}, + {0xAB09, 0xAB0E, WBP_ALetter}, + {0xAB11, 0xAB16, WBP_ALetter}, + {0xAB20, 0xAB26, WBP_ALetter}, + {0xAB28, 0xAB2E, WBP_ALetter}, + {0xABC0, 0xABE2, WBP_ALetter}, + {0xABE3, 0xABE4, WBP_Extend}, + {0xABE5, 0xABE5, WBP_Extend}, + {0xABE6, 0xABE7, WBP_Extend}, + {0xABE8, 0xABE8, WBP_Extend}, + {0xABE9, 0xABEA, WBP_Extend}, + {0xABEC, 0xABEC, WBP_Extend}, + {0xABED, 0xABED, WBP_Extend}, + {0xABF0, 0xABF9, WBP_Numeric}, + {0xAC00, 0xD7A3, WBP_ALetter}, + {0xD7B0, 0xD7C6, WBP_ALetter}, + {0xD7CB, 0xD7FB, WBP_ALetter}, + {0xFB00, 0xFB06, WBP_ALetter}, + {0xFB13, 0xFB17, WBP_ALetter}, + {0xFB1D, 0xFB1D, WBP_ALetter}, + {0xFB1E, 0xFB1E, WBP_Extend}, + {0xFB1F, 0xFB28, WBP_ALetter}, + {0xFB2A, 0xFB36, WBP_ALetter}, + {0xFB38, 0xFB3C, WBP_ALetter}, + {0xFB3E, 0xFB3E, WBP_ALetter}, + {0xFB40, 0xFB41, WBP_ALetter}, + {0xFB43, 0xFB44, WBP_ALetter}, + {0xFB46, 0xFBB1, WBP_ALetter}, + {0xFBD3, 0xFD3D, WBP_ALetter}, + {0xFD50, 0xFD8F, WBP_ALetter}, + {0xFD92, 0xFDC7, WBP_ALetter}, + {0xFDF0, 0xFDFB, WBP_ALetter}, + {0xFE00, 0xFE0F, WBP_Extend}, + {0xFE10, 0xFE10, WBP_MidNum}, + {0xFE13, 0xFE13, WBP_MidLetter}, + {0xFE14, 0xFE14, WBP_MidNum}, + {0xFE20, 0xFE26, WBP_Extend}, + {0xFE33, 0xFE34, WBP_ExtendNumLet}, + {0xFE4D, 0xFE4F, WBP_ExtendNumLet}, + {0xFE50, 0xFE50, WBP_MidNum}, + {0xFE52, 0xFE52, WBP_MidNumLet}, + {0xFE54, 0xFE54, WBP_MidNum}, + {0xFE55, 0xFE55, WBP_MidLetter}, + {0xFE70, 0xFE74, WBP_ALetter}, + {0xFE76, 0xFEFC, WBP_ALetter}, + {0xFEFF, 0xFEFF, WBP_Format}, + {0xFF07, 0xFF07, WBP_MidNumLet}, + {0xFF0C, 0xFF0C, WBP_MidNum}, + {0xFF0E, 0xFF0E, WBP_MidNumLet}, + {0xFF1A, 0xFF1A, WBP_MidLetter}, + {0xFF1B, 0xFF1B, WBP_MidNum}, + {0xFF21, 0xFF3A, WBP_ALetter}, + {0xFF3F, 0xFF3F, WBP_ExtendNumLet}, + {0xFF41, 0xFF5A, WBP_ALetter}, + {0xFF66, 0xFF6F, WBP_Katakana}, + {0xFF70, 0xFF70, WBP_Katakana}, + {0xFF71, 0xFF9D, WBP_Katakana}, + {0xFF9E, 0xFF9F, WBP_Extend}, + {0xFFA0, 0xFFBE, WBP_ALetter}, + {0xFFC2, 
0xFFC7, WBP_ALetter}, + {0xFFCA, 0xFFCF, WBP_ALetter}, + {0xFFD2, 0xFFD7, WBP_ALetter}, + {0xFFDA, 0xFFDC, WBP_ALetter}, + {0xFFF9, 0xFFFB, WBP_Format}, + {0x10000, 0x1000B, WBP_ALetter}, + {0x1000D, 0x10026, WBP_ALetter}, + {0x10028, 0x1003A, WBP_ALetter}, + {0x1003C, 0x1003D, WBP_ALetter}, + {0x1003F, 0x1004D, WBP_ALetter}, + {0x10050, 0x1005D, WBP_ALetter}, + {0x10080, 0x100FA, WBP_ALetter}, + {0x10140, 0x10174, WBP_ALetter}, + {0x101FD, 0x101FD, WBP_Extend}, + {0x10280, 0x1029C, WBP_ALetter}, + {0x102A0, 0x102D0, WBP_ALetter}, + {0x10300, 0x1031E, WBP_ALetter}, + {0x10330, 0x10340, WBP_ALetter}, + {0x10341, 0x10341, WBP_ALetter}, + {0x10342, 0x10349, WBP_ALetter}, + {0x1034A, 0x1034A, WBP_ALetter}, + {0x10380, 0x1039D, WBP_ALetter}, + {0x103A0, 0x103C3, WBP_ALetter}, + {0x103C8, 0x103CF, WBP_ALetter}, + {0x103D1, 0x103D5, WBP_ALetter}, + {0x10400, 0x1044F, WBP_ALetter}, + {0x10450, 0x1049D, WBP_ALetter}, + {0x104A0, 0x104A9, WBP_Numeric}, + {0x10800, 0x10805, WBP_ALetter}, + {0x10808, 0x10808, WBP_ALetter}, + {0x1080A, 0x10835, WBP_ALetter}, + {0x10837, 0x10838, WBP_ALetter}, + {0x1083C, 0x1083C, WBP_ALetter}, + {0x1083F, 0x10855, WBP_ALetter}, + {0x10900, 0x10915, WBP_ALetter}, + {0x10920, 0x10939, WBP_ALetter}, + {0x10A00, 0x10A00, WBP_ALetter}, + {0x10A01, 0x10A03, WBP_Extend}, + {0x10A05, 0x10A06, WBP_Extend}, + {0x10A0C, 0x10A0F, WBP_Extend}, + {0x10A10, 0x10A13, WBP_ALetter}, + {0x10A15, 0x10A17, WBP_ALetter}, + {0x10A19, 0x10A33, WBP_ALetter}, + {0x10A38, 0x10A3A, WBP_Extend}, + {0x10A3F, 0x10A3F, WBP_Extend}, + {0x10A60, 0x10A7C, WBP_ALetter}, + {0x10B00, 0x10B35, WBP_ALetter}, + {0x10B40, 0x10B55, WBP_ALetter}, + {0x10B60, 0x10B72, WBP_ALetter}, + {0x10C00, 0x10C48, WBP_ALetter}, + {0x11000, 0x11000, WBP_Extend}, + {0x11001, 0x11001, WBP_Extend}, + {0x11002, 0x11002, WBP_Extend}, + {0x11003, 0x11037, WBP_ALetter}, + {0x11038, 0x11046, WBP_Extend}, + {0x11066, 0x1106F, WBP_Numeric}, + {0x11080, 0x11081, WBP_Extend}, + {0x11082, 0x11082, WBP_Extend}, + 
{0x11083, 0x110AF, WBP_ALetter}, + {0x110B0, 0x110B2, WBP_Extend}, + {0x110B3, 0x110B6, WBP_Extend}, + {0x110B7, 0x110B8, WBP_Extend}, + {0x110B9, 0x110BA, WBP_Extend}, + {0x110BD, 0x110BD, WBP_Format}, + {0x12000, 0x1236E, WBP_ALetter}, + {0x12400, 0x12462, WBP_ALetter}, + {0x13000, 0x1342E, WBP_ALetter}, + {0x16800, 0x16A38, WBP_ALetter}, + {0x1B000, 0x1B000, WBP_Katakana}, + {0x1D165, 0x1D166, WBP_Extend}, + {0x1D167, 0x1D169, WBP_Extend}, + {0x1D16D, 0x1D172, WBP_Extend}, + {0x1D173, 0x1D17A, WBP_Format}, + {0x1D17B, 0x1D182, WBP_Extend}, + {0x1D185, 0x1D18B, WBP_Extend}, + {0x1D1AA, 0x1D1AD, WBP_Extend}, + {0x1D242, 0x1D244, WBP_Extend}, + {0x1D400, 0x1D454, WBP_ALetter}, + {0x1D456, 0x1D49C, WBP_ALetter}, + {0x1D49E, 0x1D49F, WBP_ALetter}, + {0x1D4A2, 0x1D4A2, WBP_ALetter}, + {0x1D4A5, 0x1D4A6, WBP_ALetter}, + {0x1D4A9, 0x1D4AC, WBP_ALetter}, + {0x1D4AE, 0x1D4B9, WBP_ALetter}, + {0x1D4BB, 0x1D4BB, WBP_ALetter}, + {0x1D4BD, 0x1D4C3, WBP_ALetter}, + {0x1D4C5, 0x1D505, WBP_ALetter}, + {0x1D507, 0x1D50A, WBP_ALetter}, + {0x1D50D, 0x1D514, WBP_ALetter}, + {0x1D516, 0x1D51C, WBP_ALetter}, + {0x1D51E, 0x1D539, WBP_ALetter}, + {0x1D53B, 0x1D53E, WBP_ALetter}, + {0x1D540, 0x1D544, WBP_ALetter}, + {0x1D546, 0x1D546, WBP_ALetter}, + {0x1D54A, 0x1D550, WBP_ALetter}, + {0x1D552, 0x1D6A5, WBP_ALetter}, + {0x1D6A8, 0x1D6C0, WBP_ALetter}, + {0x1D6C2, 0x1D6DA, WBP_ALetter}, + {0x1D6DC, 0x1D6FA, WBP_ALetter}, + {0x1D6FC, 0x1D714, WBP_ALetter}, + {0x1D716, 0x1D734, WBP_ALetter}, + {0x1D736, 0x1D74E, WBP_ALetter}, + {0x1D750, 0x1D76E, WBP_ALetter}, + {0x1D770, 0x1D788, WBP_ALetter}, + {0x1D78A, 0x1D7A8, WBP_ALetter}, + {0x1D7AA, 0x1D7C2, WBP_ALetter}, + {0x1D7C4, 0x1D7CB, WBP_ALetter}, + {0x1D7CE, 0x1D7FF, WBP_Numeric}, + {0xE0001, 0xE0001, WBP_Format}, + {0xE0020, 0xE007F, WBP_Format}, + {0xE0100, 0xE01EF, WBP_Extend}, + {0xFFFFFFFF, 0xFFFFFFFF, WBP_Undefined} +}; diff --git a/linebreak/linebreak/wordbreakdata1.tmpl b/linebreak/linebreak/wordbreakdata1.tmpl new file mode 100644 
index 0000000..94f1b0e --- /dev/null +++ b/linebreak/linebreak/wordbreakdata1.tmpl @@ -0,0 +1,5 @@ + +#include "linebreak.h" +#include "wordbreakdef.h" + +static struct WordBreakProperties wb_prop_default[] = { diff --git a/linebreak/linebreak/wordbreakdata2.tmpl b/linebreak/linebreak/wordbreakdata2.tmpl new file mode 100644 index 0000000..10f154e --- /dev/null +++ b/linebreak/linebreak/wordbreakdata2.tmpl @@ -0,0 +1,2 @@ + {0xFFFFFFFF, 0xFFFFFFFF, WBP_Undefined} +}; diff --git a/linebreak/linebreak/wordbreakdef.h b/linebreak/linebreak/wordbreakdef.h new file mode 100644 index 0000000..0b4b165 --- /dev/null +++ b/linebreak/linebreak/wordbreakdef.h @@ -0,0 +1,78 @@ +/* vim: set tabstop=4 shiftwidth=4: */ + +/* + * Word breaking in a Unicode sequence. Designed to be used in a + * generic text renderer. + * + * Copyright (C) 2012 Tom Hacohen + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute + * it freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must + * not claim that you wrote the original software. If you use this + * software in a product, an acknowledgement in the product + * documentation would be appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must + * not be misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source + * distribution.
+ * + * The main reference is Unicode Standard Annex 29 (UAX #29): + * <URL:http://www.unicode.org/reports/tr29/> + * + * When this library was designed, this annex was at Revision 17, for + * Unicode 6.0.0: + * <URL:http://www.unicode.org/reports/tr29/tr29-17.html> + * + * The Unicode Terms of Use are available at + * <URL:http://www.unicode.org/copyright.html> + */ + +/** + * @file wordbreakdef.h + * + * Definitions of internal data structures, declarations of global + * variables, and function prototypes for the word breaking algorithm. + * + * @version 2.1, 2012/01/18 + * @author Tom Hacohen + */ + +/** + * Word break classes. This is a direct mapping of Table 3 of Unicode + * Standard Annex 29, Revision 17. + */ +enum WordBreakClass +{ + WBP_Undefined, /**< Also the class of the closing 0xFFFFFFFF entry of wb_prop_default */ + WBP_CR, + WBP_LF, + WBP_Newline, + WBP_Extend, + WBP_Format, + WBP_Katakana, + WBP_ALetter, + WBP_MidNumLet, + WBP_MidLetter, + WBP_MidNum, + WBP_Numeric, + WBP_ExtendNumLet, + WBP_Any +}; + +/** + * Struct for entries of word break properties. Each entry maps one range of code points to a class. The array of the + * entries \e must be sorted; the generated table wb_prop_default lists its ranges in ascending code point order. + */ +struct WordBreakProperties +{ + utf32_t start; /**< First code point of the range */ + utf32_t end; /**< Last code point of the range (inclusive; equals start for single-character entries) */ + enum WordBreakClass prop; /**< The word breaking property */ +};