|
libunibreak 5.1
|
Implementation of the word breaking algorithm as described in Unicode Standard Annex 29. More...
#include <assert.h>
#include <stddef.h>
#include <string.h>
#include "unibreakdef.h"
#include "wordbreak.h"
#include "wordbreakdata.c"
#include "emojidef.h"
Macros | |
| #define | IS_WB3ab(cls) |
Functions | |
| void | init_wordbreak (void) |
| Initializes the wordbreak internals. More... | |
| static enum WordBreakClass | get_char_wb_class (utf32_t ch, const struct WordBreakProperties *wbp, size_t len) |
| Gets the word breaking class of a character. More... | |
| static void | set_brks_to (const void *s, char *brks, size_t posStart, size_t posEnd, size_t len, char brkType, get_next_char_t get_next_char) |
| Sets the word break types to a specific value in a range. More... | |
| static void | set_wordbreaks (const void *s, size_t len, const char *lang, char *brks, get_next_char_t get_next_char) |
| Sets the word breaking information for a generic input string. More... | |
| void | set_wordbreaks_utf8 (const utf8_t *s, size_t len, const char *lang, char *brks) |
| Sets the word breaking information for a UTF-8 input string. More... | |
| void | set_wordbreaks_utf16 (const utf16_t *s, size_t len, const char *lang, char *brks) |
| Sets the word breaking information for a UTF-16 input string. More... | |
| void | set_wordbreaks_utf32 (const utf32_t *s, size_t len, const char *lang, char *brks) |
| Sets the word breaking information for a UTF-32 input string. More... | |
Implementation of the word breaking algorithm as described in Unicode Standard Annex 29.
#define IS_WB3ab(cls)
|
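static enum WordBreakClass get_char_wb_class (utf32_t ch, const struct WordBreakProperties *wbp, size_t len)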
Gets the word breaking class of a character.
| ch | character to check |
| wbp | pointer to the wbp breaking properties array |
| len | size of the wbp array in number of items |
Returns the word breaking class of the character if found; WBP_Any otherwise.
void init_wordbreak (void)
Initializes the wordbreak internals.
It currently does nothing, but it may in the future.
|
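static void set_brks_to (const void *s, char *brks, size_t posStart, size_t posEnd, size_t len, char brkType, get_next_char_t get_next_char)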
Sets the word break types to a specific value in a range.
It sets the inside chars to WORDBREAK_INSIDEACHAR and the rest to brkType. Assumes brks is initialized - all the cells with WORDBREAK_NOBREAK are cells that we really don't want to break after.
| [in] | s | input string |
| [out] | brks | breaks array to fill |
| [in] | posStart | start position |
| [in] | posEnd | end position (exclusive) |
| [in] | len | length of the string |
| [in] | brkType | breaks type to use |
| [in] | get_next_char | function to get the next UTF-32 character |
|
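static void set_wordbreaks (const void *s, size_t len, const char *lang, char *brks, get_next_char_t get_next_char)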
Sets the word breaking information for a generic input string.
| [in] | s | input string |
| [in] | len | length of the input |
| [in] | lang | language of the input (reserved for future use) |
| [out] | brks | pointer to the output breaking data, containing WORDBREAK_BREAK, WORDBREAK_NOBREAK, or WORDBREAK_INSIDEACHAR |
| [in] | get_next_char | function to get the next UTF-32 character |
void set_wordbreaks_utf16 (const utf16_t *s, size_t len, const char *lang, char *brks)
Sets the word breaking information for a UTF-16 input string.
| [in] | s | input UTF-16 string |
| [in] | len | length of the input |
| [in] | lang | language of the input (reserved for future use) |
| [out] | brks | pointer to the output breaking data, containing WORDBREAK_BREAK, WORDBREAK_NOBREAK, or WORDBREAK_INSIDEACHAR |
void set_wordbreaks_utf32 (const utf32_t *s, size_t len, const char *lang, char *brks)
Sets the word breaking information for a UTF-32 input string.
| [in] | s | input UTF-32 string |
| [in] | len | length of the input |
| [in] | lang | language of the input (reserved for future use) |
| [out] | brks | pointer to the output breaking data, containing WORDBREAK_BREAK, WORDBREAK_NOBREAK, or WORDBREAK_INSIDEACHAR |
void set_wordbreaks_utf8 (const utf8_t *s, size_t len, const char *lang, char *brks)
Sets the word breaking information for a UTF-8 input string.
| [in] | s | input UTF-8 string |
| [in] | len | length of the input |
| [in] | lang | language of the input (reserved for future use) |
| [out] | brks | pointer to the output breaking data, containing WORDBREAK_BREAK, WORDBREAK_NOBREAK, or WORDBREAK_INSIDEACHAR |