437 lines
11 KiB
C
437 lines
11 KiB
C
/* vim: set tabstop=4 shiftwidth=4: */

/*
 * Word breaking in a Unicode sequence. Designed to be used in a
 * generic text renderer.
 *
 * Copyright (C) 2012 Tom Hacohen <tom@stosb.com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the author be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute
 * it freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must
 *    not claim that you wrote the original software. If you use this
 *    software in a product, an acknowledgement in the product
 *    documentation would be appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must
 *    not be misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source
 *    distribution.
 *
 * The main reference is Unicode Standard Annex 29 (UAX #29):
 *      <URL:http://unicode.org/reports/tr29>
 *
 * When this library was designed, this annex was at Revision 17, for
 * Unicode 6.0.0:
 *      <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
 *
 * The Unicode Terms of Use are available at
 *      <URL:http://www.unicode.org/copyright.html>
 */
|
|
|
|
/**
 * @file    wordbreak.c
 *
 * Implementation of the word breaking algorithm as described in Unicode
 * Standard Annex 29.
 *
 * @version 2.2, 2012/02/04
 * @author  Tom Hacohen
 */
|
|
|
|
#include <assert.h>
|
|
#include <stddef.h>
|
|
#include <string.h>
|
|
#include "linebreak.h"
|
|
#include "linebreakdef.h"
|
|
|
|
#include "wordbreak.h"
|
|
#include "wordbreakdata.c"
|
|
|
|
/* Number of elements in an array object.  Only valid for real arrays, not
 * for pointers; the argument is parenthesized so any array expression can
 * be passed safely. */
#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0]))
|
|
|
|
/**
 * Initializes the word breaking module.
 *
 * At present this is a no-op: the function exists as a hook so that
 * set-up work can be added later without changing callers.
 */
void init_wordbreak(void)
{
    /* Intentionally empty: nothing needs to be prepared right now. */
}
|
|
|
|
/**
|
|
* Gets the word breaking class of a character.
|
|
*
|
|
* @param ch character to check
|
|
* @param wbp pointer to the wbp breaking properties array
|
|
* @param len size of the wbp array in number of items
|
|
* @return the word breaking class if found; \c WBP_Any otherwise
|
|
*/
|
|
static enum WordBreakClass get_char_wb_class(
|
|
utf32_t ch,
|
|
struct WordBreakProperties *wbp,
|
|
size_t len)
|
|
{
|
|
int min = 0;
|
|
int max = len - 1;
|
|
int mid;
|
|
|
|
do
|
|
{
|
|
mid = (min + max) / 2;
|
|
|
|
if (ch < wbp[mid].start)
|
|
max = mid - 1;
|
|
else if (ch > wbp[mid].end)
|
|
min = mid + 1;
|
|
else
|
|
return wbp[mid].prop;
|
|
}
|
|
while (min <= max);
|
|
|
|
return WBP_Any;
|
|
}
|
|
|
|
/**
|
|
* Sets the word break types to a specific value in a range.
|
|
*
|
|
* It sets the inside chars to #WORDBREAK_INSIDEACHAR and the rest to brkType.
|
|
* Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are
|
|
* cells that we really don't want to break after.
|
|
*
|
|
* @param[in] s input string
|
|
* @param[out] brks breaks array to fill
|
|
* @param[in] posStart start position
|
|
* @param[in] posEnd end position (exclusive)
|
|
* @param[in] len length of the string
|
|
* @param[in] brkType breaks type to use
|
|
* @param[in] get_next_char function to get the next UTF-32 character
|
|
*/
|
|
static void set_brks_to(
|
|
const void *s,
|
|
char *brks,
|
|
size_t posStart,
|
|
size_t posEnd,
|
|
size_t len,
|
|
char brkType,
|
|
get_next_char_t get_next_char)
|
|
{
|
|
size_t posNext = posStart;
|
|
while (posNext < posEnd)
|
|
{
|
|
utf32_t ch;
|
|
ch = get_next_char(s, len, &posNext);
|
|
assert(ch != EOS);
|
|
for (; posStart < posNext - 1; ++posStart)
|
|
brks[posStart] = WORDBREAK_INSIDEACHAR;
|
|
assert(posStart == posNext - 1);
|
|
|
|
/* Only set it if we haven't set it not to break before. */
|
|
if (brks[posStart] != WORDBREAK_NOBREAK)
|
|
brks[posStart] = brkType;
|
|
posStart = posNext;
|
|
}
|
|
}
|
|
|
|
/* Checks to see if the class is newline, CR, or LF (rules WB3a and b).
 * The argument is parenthesized so that any expression can be passed.
 * It may be evaluated up to three times, so it must not have side
 * effects. */
#define IS_WB3ab(cls) (((cls) == WBP_Newline) || ((cls) == WBP_CR) || \
                       ((cls) == WBP_LF))
|
|
|
|
/**
|
|
* Sets the word breaking information for a generic input string.
|
|
*
|
|
* @param[in] s input string
|
|
* @param[in] len length of the input
|
|
* @param[in] lang language of the input
|
|
* @param[out] brks pointer to the output breaking data, containing
|
|
* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
|
|
* #WORDBREAK_INSIDEACHAR
|
|
* @param[in] get_next_char function to get the next UTF-32 character
|
|
*/
|
|
static void set_wordbreaks(
|
|
const void *s,
|
|
size_t len,
|
|
const char *lang,
|
|
char *brks,
|
|
get_next_char_t get_next_char)
|
|
{
|
|
enum WordBreakClass wbcLast = WBP_Undefined;
|
|
/* wbcSeqStart is the class that started the current sequence.
|
|
* WBP_Undefined is a special case that means "sot".
|
|
* This value is the class that is at the start of the current rule
|
|
* matching sequence. For example, in case of Numeric+MidNum+Numeric
|
|
* it'll be Numeric all the way.
|
|
*/
|
|
enum WordBreakClass wbcSeqStart = WBP_Undefined;
|
|
utf32_t ch;
|
|
size_t posNext = 0;
|
|
size_t posCur = 0;
|
|
size_t posLast = 0;
|
|
|
|
/* TODO: Language-specific specialization. */
|
|
(void) lang;
|
|
|
|
/* Init brks. */
|
|
memset(brks, WORDBREAK_BREAK, len);
|
|
|
|
ch = get_next_char(s, len, &posNext);
|
|
|
|
while (ch != EOS)
|
|
{
|
|
enum WordBreakClass wbcCur;
|
|
wbcCur = get_char_wb_class(ch, wb_prop_default,
|
|
ARRAY_LEN(wb_prop_default));
|
|
|
|
switch (wbcCur)
|
|
{
|
|
case WBP_CR:
|
|
/* WB3b */
|
|
set_brks_to(s, brks, posLast, posCur, len,
|
|
WORDBREAK_BREAK, get_next_char);
|
|
wbcSeqStart = wbcCur;
|
|
posLast = posCur;
|
|
break;
|
|
|
|
case WBP_LF:
|
|
if (wbcSeqStart == WBP_CR) /* WB3 */
|
|
{
|
|
set_brks_to(s, brks, posLast, posCur, len,
|
|
WORDBREAK_NOBREAK, get_next_char);
|
|
wbcSeqStart = wbcCur;
|
|
posLast = posCur;
|
|
break;
|
|
}
|
|
/* Fall off */
|
|
|
|
case WBP_Newline:
|
|
/* WB3a,3b */
|
|
set_brks_to(s, brks, posLast, posCur, len,
|
|
WORDBREAK_BREAK, get_next_char);
|
|
wbcSeqStart = wbcCur;
|
|
posLast = posCur;
|
|
break;
|
|
|
|
case WBP_Extend:
|
|
case WBP_Format:
|
|
/* WB4 - If not the first char/after a newline (WB3a,3b), skip
|
|
* this class, set it to be the same as the prev, and mark
|
|
* brks not to break before them. */
|
|
if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
|
|
{
|
|
set_brks_to(s, brks, posLast, posCur, len,
|
|
WORDBREAK_BREAK, get_next_char);
|
|
wbcSeqStart = wbcCur;
|
|
}
|
|
else
|
|
{
|
|
/* It's surely not the first */
|
|
brks[posCur - 1] = WORDBREAK_NOBREAK;
|
|
/* "inherit" the previous class. */
|
|
wbcCur = wbcLast;
|
|
}
|
|
break;
|
|
|
|
case WBP_Katakana:
|
|
if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
|
|
(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
|
|
{
|
|
set_brks_to(s, brks, posLast, posCur, len,
|
|
WORDBREAK_NOBREAK, get_next_char);
|
|
}
|
|
/* No rule found, reset */
|
|
else
|
|
{
|
|
set_brks_to(s, brks, posLast, posCur, len,
|
|
WORDBREAK_BREAK, get_next_char);
|
|
}
|
|
wbcSeqStart = wbcCur;
|
|
posLast = posCur;
|
|
break;
|
|
|
|
case WBP_ALetter:
|
|
if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */
|
|
(wbcLast == WBP_Numeric) || /* WB10 */
|
|
(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
|
|
{
|
|
set_brks_to(s, brks, posLast, posCur, len,
|
|
WORDBREAK_NOBREAK, get_next_char);
|
|
}
|
|
/* No rule found, reset */
|
|
else
|
|
{
|
|
set_brks_to(s, brks, posLast, posCur, len,
|
|
WORDBREAK_BREAK, get_next_char);
|
|
}
|
|
wbcSeqStart = wbcCur;
|
|
posLast = posCur;
|
|
break;
|
|
|
|
case WBP_MidNumLet:
|
|
if ((wbcLast == WBP_ALetter) || /* WB6,7 */
|
|
(wbcLast == WBP_Numeric)) /* WB11,12 */
|
|
{
|
|
/* Go on */
|
|
}
|
|
else
|
|
{
|
|
set_brks_to(s, brks, posLast, posCur, len,
|
|
WORDBREAK_BREAK, get_next_char);
|
|
wbcSeqStart = wbcCur;
|
|
posLast = posCur;
|
|
}
|
|
break;
|
|
|
|
case WBP_MidLetter:
|
|
if (wbcLast == WBP_ALetter) /* WB6,7 */
|
|
{
|
|
/* Go on */
|
|
}
|
|
else
|
|
{
|
|
set_brks_to(s, brks, posLast, posCur, len,
|
|
WORDBREAK_BREAK, get_next_char);
|
|
wbcSeqStart = wbcCur;
|
|
posLast = posCur;
|
|
}
|
|
break;
|
|
|
|
case WBP_MidNum:
|
|
if (wbcLast == WBP_Numeric) /* WB11,12 */
|
|
{
|
|
/* Go on */
|
|
}
|
|
else
|
|
{
|
|
set_brks_to(s, brks, posLast, posCur, len,
|
|
WORDBREAK_BREAK, get_next_char);
|
|
wbcSeqStart = wbcCur;
|
|
posLast = posCur;
|
|
}
|
|
break;
|
|
|
|
case WBP_Numeric:
|
|
if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
|
|
(wbcLast == WBP_ALetter) || /* WB9 */
|
|
(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
|
|
{
|
|
set_brks_to(s, brks, posLast, posCur, len,
|
|
WORDBREAK_NOBREAK, get_next_char);
|
|
}
|
|
/* No rule found, reset */
|
|
else
|
|
{
|
|
set_brks_to(s, brks, posLast, posCur, len,
|
|
WORDBREAK_BREAK, get_next_char);
|
|
}
|
|
wbcSeqStart = wbcCur;
|
|
posLast = posCur;
|
|
break;
|
|
|
|
case WBP_ExtendNumLet:
|
|
/* WB13a,13b */
|
|
if ((wbcSeqStart == wbcLast) &&
|
|
((wbcLast == WBP_ALetter) ||
|
|
(wbcLast == WBP_Numeric) ||
|
|
(wbcLast == WBP_Katakana) ||
|
|
(wbcLast == WBP_ExtendNumLet)))
|
|
{
|
|
set_brks_to(s, brks, posLast, posCur, len,
|
|
WORDBREAK_NOBREAK, get_next_char);
|
|
}
|
|
/* No rule found, reset */
|
|
else
|
|
{
|
|
set_brks_to(s, brks, posLast, posCur, len,
|
|
WORDBREAK_BREAK, get_next_char);
|
|
}
|
|
wbcSeqStart = wbcCur;
|
|
posLast = posCur;
|
|
break;
|
|
|
|
case WBP_Any:
|
|
/* Allow breaks and reset */
|
|
set_brks_to(s, brks, posLast, posCur, len,
|
|
WORDBREAK_BREAK, get_next_char);
|
|
wbcSeqStart = wbcCur;
|
|
posLast = posCur;
|
|
break;
|
|
|
|
default:
|
|
/* Error, should never get here! */
|
|
assert(0);
|
|
break;
|
|
}
|
|
|
|
wbcLast = wbcCur;
|
|
posCur = posNext;
|
|
ch = get_next_char(s, len, &posNext);
|
|
}
|
|
|
|
/* WB2 */
|
|
set_brks_to(s, brks, posLast, posNext, len,
|
|
WORDBREAK_BREAK, get_next_char);
|
|
}
|
|
|
|
/**
|
|
* Sets the word breaking information for a UTF-8 input string.
|
|
*
|
|
* @param[in] s input UTF-8 string
|
|
* @param[in] len length of the input
|
|
* @param[in] lang language of the input
|
|
* @param[out] brks pointer to the output breaking data, containing
|
|
* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
|
|
* #WORDBREAK_INSIDEACHAR
|
|
*/
|
|
void set_wordbreaks_utf8(
|
|
const utf8_t *s,
|
|
size_t len,
|
|
const char *lang,
|
|
char *brks)
|
|
{
|
|
set_wordbreaks(s, len, lang, brks,
|
|
(get_next_char_t)lb_get_next_char_utf8);
|
|
}
|
|
|
|
/**
|
|
* Sets the word breaking information for a UTF-16 input string.
|
|
*
|
|
* @param[in] s input UTF-16 string
|
|
* @param[in] len length of the input
|
|
* @param[in] lang language of the input
|
|
* @param[out] brks pointer to the output breaking data, containing
|
|
* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
|
|
* #WORDBREAK_INSIDEACHAR
|
|
*/
|
|
void set_wordbreaks_utf16(
|
|
const utf16_t *s,
|
|
size_t len,
|
|
const char *lang,
|
|
char *brks)
|
|
{
|
|
set_wordbreaks(s, len, lang, brks,
|
|
(get_next_char_t)lb_get_next_char_utf16);
|
|
}
|
|
|
|
/**
|
|
* Sets the word breaking information for a UTF-32 input string.
|
|
*
|
|
* @param[in] s input UTF-32 string
|
|
* @param[in] len length of the input
|
|
* @param[in] lang language of the input
|
|
* @param[out] brks pointer to the output breaking data, containing
|
|
* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
|
|
* #WORDBREAK_INSIDEACHAR
|
|
*/
|
|
void set_wordbreaks_utf32(
|
|
const utf32_t *s,
|
|
size_t len,
|
|
const char *lang,
|
|
char *brks)
|
|
{
|
|
set_wordbreaks(s, len, lang, brks,
|
|
(get_next_char_t)lb_get_next_char_utf32);
|
|
}
|