harbour-books/linebreak/linebreak/wordbreak.c

/* vim: set tabstop=4 shiftwidth=4: */

/*
 * Word breaking in a Unicode sequence.  Designed to be used in a
 * generic text renderer.
 *
 * Copyright (C) 2012 Tom Hacohen <tom@stosb.com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the author be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute
 * it freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must
 *    not claim that you wrote the original software.  If you use this
 *    software in a product, an acknowledgement in the product
 *    documentation would be appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must
 *    not be misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source
 *    distribution.
 *
 * The main reference is Unicode Standard Annex 29 (UAX #29):
 *		<URL:http://unicode.org/reports/tr29>
 *
 * When this library was designed, this annex was at Revision 17, for
 * Unicode 6.0.0:
 *		<URL:http://www.unicode.org/reports/tr29/tr29-17.html>
 *
 * The Unicode Terms of Use are available at
 *		<URL:http://www.unicode.org/copyright.html>
 */

/**
 * @file	wordbreak.c
 *
 * Implementation of the word breaking algorithm as described in Unicode
 * Standard Annex 29.
 *
 * @version	2.2, 2012/02/04
 * @author	Tom Hacohen
 */

#include <assert.h>
#include <stddef.h>
#include <string.h>
#include "linebreak.h"
#include "linebreakdef.h"

#include "wordbreak.h"
#include "wordbreakdata.c"

/* Number of elements in a true array (must not be used on a pointer).
 * The argument is parenthesized before indexing so that expressions such as
 * a dereferenced, cast pointer-to-array are measured correctly. */
#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0]))

/**
 * Set-up hook for the word breaking module.
 *
 * There is no state to prepare at present, so the body is intentionally
 * empty.  The function is kept so that initialization work can be added
 * later without changing the interface.
 */
void init_wordbreak(void)
{
	/* Nothing to initialize. */
}

/**
 * Gets the word breaking class of a character.
 *
 * The lookup is a binary search, so \a wbp is expected to be ordered by
 * ascending code point with non-overlapping [start, end] ranges.
 *
 * @param ch	character to check
 * @param wbp	pointer to the wbp breaking properties array
 * @param len	size of the wbp array in number of items; an empty array
 *				(0) is allowed and yields \c WBP_Any
 * @return		the word breaking class if found; \c WBP_Any otherwise
 */
static enum WordBreakClass get_char_wb_class(
		utf32_t ch,
		struct WordBreakProperties *wbp,
		size_t len)
{
	/* Search window is the half-open index range [lo, hi).  Keeping the
	 * bounds in size_t avoids narrowing len to int and makes an empty
	 * table fall straight through without touching wbp[0]. */
	size_t lo = 0;
	size_t hi = len;

	while (lo < hi)
	{
		/* Midpoint of the closed range [lo, hi - 1], computed without
		 * adding the two bounds together (no overflow). */
		size_t mid = lo + (hi - 1 - lo) / 2;

		if (ch < wbp[mid].start)
			hi = mid;
		else if (ch > wbp[mid].end)
			lo = mid + 1;
		else
			return wbp[mid].prop;
	}

	/* Not covered by any range in the table. */
	return WBP_Any;
}

/**
 * Sets the word break types to a specific value in a range.
 *
 * The range is walked one character at a time with \a get_next_char.  For
 * every character, all positions except its last one are set to
 * #WORDBREAK_INSIDEACHAR, and the last position is set to \a brkType.
 * Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are
 * cells that we really don't want to break after, so they are left alone.
 *
 * If the decoder reports EOS or fails to advance inside the range (which
 * should not happen and is asserted in debug builds), the function stops
 * early instead of writing outside the range.
 *
 * @param[in]  s			input string
 * @param[out] brks			breaks array to fill
 * @param[in]  posStart		start position
 * @param[in]  posEnd		end position (exclusive)
 * @param[in]  len			length of the string
 * @param[in]  brkType		breaks type to use
 * @param[in] get_next_char	function to get the next UTF-32 character
 */
static void set_brks_to(
		const void *s,
		char *brks,
		size_t posStart,
		size_t posEnd,
		size_t len,
		char brkType,
		get_next_char_t get_next_char)
{
	size_t posNext = posStart;

	while (posNext < posEnd)
	{
		utf32_t ch = get_next_char(s, len, &posNext);

		/* posStart equals the value posNext had before the call, so
		 * "posNext > posStart" means the decoder consumed something. */
		assert(ch != EOS);
		assert(posNext > posStart);

		/* Runtime guard for builds with NDEBUG: without progress the
		 * loop below would run past the range (or never terminate). */
		if ((ch == EOS) || (posNext <= posStart))
			break;

		/* Every position of this character except the last one.  Written
		 * as "posStart + 1 < posNext" so nothing can wrap around. */
		for (; posStart + 1 < posNext; ++posStart)
			brks[posStart] = WORDBREAK_INSIDEACHAR;

		/* Only set it if we haven't set it not to break before. */
		if (brks[posStart] != WORDBREAK_NOBREAK)
			brks[posStart] = brkType;
		posStart = posNext;
	}
}

/* Checks to see if the class is newline, CR, or LF (rules WB3a and b).
 * The argument is parenthesized so that any expression can be passed, but
 * it is evaluated up to three times and must be free of side effects. */
#define IS_WB3ab(cls) (((cls) == WBP_Newline) || ((cls) == WBP_CR) || \
					   ((cls) == WBP_LF))

/**
 * Sets the word breaking information for a generic input string.
 *
 * The input is decoded one character at a time with \a get_next_char and
 * each character is classified with get_char_wb_class().  The decision
 * for a run of characters is written through set_brks_to() once the
 * following character shows which rule applies.
 *
 * @param[in]  s			input string
 * @param[in]  len			length of the input
 * @param[in]  lang			language of the input (currently unused)
 * @param[out] brks			pointer to the output breaking data, containing
 *							#WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *							#WORDBREAK_INSIDEACHAR; \a len entries are
 *							written
 * @param[in] get_next_char	function to get the next UTF-32 character
 */
static void set_wordbreaks(
		const void *s,
		size_t len,
		const char *lang,
		char *brks,
		get_next_char_t get_next_char)
{
	/* Class of the previous character (after WB4 inheritance, see below). */
	enum WordBreakClass wbcLast = WBP_Undefined;
	/* wbcSeqStart is the class that started the current sequence.
	 * WBP_Undefined is a special case that means "sot".
	 * This value is the class that is at the start of the current rule
	 * matching sequence. For example, in case of Numeric+MidNum+Numeric
	 * it'll be Numeric all the way.
	 */
	enum WordBreakClass wbcSeqStart = WBP_Undefined;
	utf32_t ch;
	/* posNext: position just after the current character.
	 * posCur:  position where the current character starts.
	 * posLast: start of the range whose break types are still pending. */
	size_t posNext = 0;
	size_t posCur = 0;
	size_t posLast = 0;

	/* TODO: Language-specific specialization. */
	(void) lang;

	/* Init brks: every position allows a break until a rule says
	 * otherwise. */
	memset(brks, WORDBREAK_BREAK, len);

	ch = get_next_char(s, len, &posNext);

	while (ch != EOS)
	{
		enum WordBreakClass wbcCur;
		wbcCur = get_char_wb_class(ch, wb_prop_default,
								   ARRAY_LEN(wb_prop_default));

		switch (wbcCur)
		{
	    case WBP_CR:
			/* WB3b */
			set_brks_to(s, brks, posLast, posCur, len,
						WORDBREAK_BREAK, get_next_char);
			wbcSeqStart = wbcCur;
			posLast = posCur;
			break;

	    case WBP_LF:
			if (wbcSeqStart == WBP_CR) /* WB3 */
			{
				set_brks_to(s, brks, posLast, posCur, len,
							WORDBREAK_NOBREAK, get_next_char);
				wbcSeqStart = wbcCur;
				posLast = posCur;
				break;
			}
			/* Fall through: an LF not preceded by CR is handled like
			 * any other newline. */

	    case WBP_Newline:
			/* WB3a,3b */
			set_brks_to(s, brks, posLast, posCur, len,
						WORDBREAK_BREAK, get_next_char);
			wbcSeqStart = wbcCur;
			posLast = posCur;
			break;

	    case WBP_Extend:
	    case WBP_Format:
			/* WB4 - If not the first char/after a newline (WB3a,3b), skip
			 * this class, set it to be the same as the prev, and mark
			 * brks not to break before them. */
			if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
			{
				set_brks_to(s, brks, posLast, posCur, len,
							WORDBREAK_BREAK, get_next_char);
				wbcSeqStart = wbcCur;
				/* Note: posLast is left unchanged in this branch. */
			}
			else
			{
				/* It's surely not the first, so posCur - 1 is a valid
				 * index: the last position of the previous character. */
				brks[posCur - 1] = WORDBREAK_NOBREAK;
				/* "inherit" the previous class. */
				wbcCur = wbcLast;
			}
			break;

	    case WBP_Katakana:
			if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
					(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
			{
				set_brks_to(s, brks, posLast, posCur, len,
							WORDBREAK_NOBREAK, get_next_char);
			}
			/* No rule found, reset */
			else
			{
				set_brks_to(s, brks, posLast, posCur, len,
							WORDBREAK_BREAK, get_next_char);
			}
			wbcSeqStart = wbcCur;
			posLast = posCur;
			break;

	    case WBP_ALetter:
			if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */
					(wbcLast == WBP_Numeric) || /* WB10 */
					(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
			{
				set_brks_to(s, brks, posLast, posCur, len,
							WORDBREAK_NOBREAK, get_next_char);
			}
			/* No rule found, reset */
			else
			{
				set_brks_to(s, brks, posLast, posCur, len,
							WORDBREAK_BREAK, get_next_char);
			}
			wbcSeqStart = wbcCur;
			posLast = posCur;
			break;

	    case WBP_MidNumLet:
			if ((wbcLast == WBP_ALetter) || /* WB6,7 */
					(wbcLast == WBP_Numeric)) /* WB11,12 */
			{
				/* Go on: nothing is written and posLast/wbcSeqStart are
				 * kept, so the pending range is decided by the next
				 * character. */
			}
			else
			{
				set_brks_to(s, brks, posLast, posCur, len,
							WORDBREAK_BREAK, get_next_char);
				wbcSeqStart = wbcCur;
				posLast = posCur;
			}
			break;

	    case WBP_MidLetter:
			if (wbcLast == WBP_ALetter) /* WB6,7 */
			{
				/* Go on: decision deferred to the next character. */
			}
			else
			{
				set_brks_to(s, brks, posLast, posCur, len,
							WORDBREAK_BREAK, get_next_char);
				wbcSeqStart = wbcCur;
				posLast = posCur;
			}
			break;

	    case WBP_MidNum:
			if (wbcLast == WBP_Numeric) /* WB11,12 */
			{
				/* Go on: decision deferred to the next character. */
			}
			else
			{
				set_brks_to(s, brks, posLast, posCur, len,
							WORDBREAK_BREAK, get_next_char);
				wbcSeqStart = wbcCur;
				posLast = posCur;
			}
			break;

	    case WBP_Numeric:
			if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
					(wbcLast == WBP_ALetter) || /* WB9 */
					(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
			{
				set_brks_to(s, brks, posLast, posCur, len,
							WORDBREAK_NOBREAK, get_next_char);
			}
			/* No rule found, reset */
			else
			{
				set_brks_to(s, brks, posLast, posCur, len,
							WORDBREAK_BREAK, get_next_char);
			}
			wbcSeqStart = wbcCur;
			posLast = posCur;
			break;

	    case WBP_ExtendNumLet:
			/* WB13a,13b */
			if ((wbcSeqStart == wbcLast) &&
				((wbcLast == WBP_ALetter) ||
				 (wbcLast == WBP_Numeric) ||
				 (wbcLast == WBP_Katakana) ||
				 (wbcLast == WBP_ExtendNumLet)))
			{
				set_brks_to(s, brks, posLast, posCur, len,
							WORDBREAK_NOBREAK, get_next_char);
			}
			/* No rule found, reset */
			else
			{
				set_brks_to(s, brks, posLast, posCur, len,
							WORDBREAK_BREAK, get_next_char);
			}
			wbcSeqStart = wbcCur;
			posLast = posCur;
			break;

		 case WBP_Any:
			/* Allow breaks and reset */
			set_brks_to(s, brks, posLast, posCur, len,
						WORDBREAK_BREAK, get_next_char);
			wbcSeqStart = wbcCur;
			posLast = posCur;
			break;

	    default:
			/* Error, should never get here! */
			assert(0);
			break;
		}

		/* Advance: remember this class and move on to the next character. */
		wbcLast = wbcCur;
		posCur = posNext;
		ch = get_next_char(s, len, &posNext);
    }

	/* WB2: flush whatever range is still pending at the end of the text. */
	set_brks_to(s, brks, posLast, posNext, len,
				WORDBREAK_BREAK, get_next_char);
}

/**
 * Sets the word breaking information for a UTF-8 input string.
 *
 * @param[in]  s	input UTF-8 string
 * @param[in]  len	length of the input
 * @param[in]  lang	language of the input
 * @param[out] brks	pointer to the output breaking data, containing
 *					#WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *					#WORDBREAK_INSIDEACHAR
 */
void set_wordbreaks_utf8(
		const utf8_t *s,
		size_t len,
		const char *lang,
		char *brks)
{
	set_wordbreaks(s, len, lang, brks,
				   (get_next_char_t)lb_get_next_char_utf8);
}

/**
 * Sets the word breaking information for a UTF-16 input string.
 *
 * @param[in]  s	input UTF-16 string
 * @param[in]  len	length of the input
 * @param[in]  lang	language of the input
 * @param[out] brks	pointer to the output breaking data, containing
 *					#WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *					#WORDBREAK_INSIDEACHAR
 */
void set_wordbreaks_utf16(
		const utf16_t *s,
		size_t len,
		const char *lang,
		char *brks)
{
	set_wordbreaks(s, len, lang, brks,
				   (get_next_char_t)lb_get_next_char_utf16);
}

/**
 * Sets the word breaking information for a UTF-32 input string.
 *
 * @param[in]  s	input UTF-32 string
 * @param[in]  len	length of the input
 * @param[in]  lang	language of the input
 * @param[out] brks	pointer to the output breaking data, containing
 *					#WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *					#WORDBREAK_INSIDEACHAR
 */
void set_wordbreaks_utf32(
		const utf32_t *s,
		size_t len,
		const char *lang,
		char *brks)
{
	set_wordbreaks(s, len, lang, brks,
				   (get_next_char_t)lb_get_next_char_utf32);
}