mirror of
https://github.com/torrentpier/torrentpier-lts.git
synced 2025-02-28 15:10:54 +03:00
4267 lines
147 KiB
PHP
4267 lines
147 KiB
PHP
<?php if (!defined('BB_ROOT')) die(basename(__FILE__));
|
||
/**
|
||
* UTF8 support in PHP5.
|
||
* PHP5 UTF8 is a UTF8 aware library of functions mirroring PHP's own string functions.
|
||
*
|
||
* The powerful solution/contribution for UTF-8 support in your framework/CMS, written on PHP.
|
||
* This library is a further development of http://sourceforge.net/projects/phputf8 (last updated in 2007).
|
||
*
|
||
* Features and benefits
|
||
* * Compatibility with the interface standard PHP functions that deal with single-byte encodings
|
||
* * Ability to work without the PHP extensions ICONV and MBSTRING; if they are available, they are actively used!
|
||
* Uses the fastest available method between MBSTRING, ICONV, native on PHP and hacks.
|
||
* * Useful features that are missing from ICONV and MBSTRING
|
||
* * The methods that take and return a string, are able to take and return null.
|
||
* This is useful when selecting values from a database.
|
||
* * Several methods are able to process arrays recursively:
|
||
* array_change_key_case(), convert_from(), convert_to(), strict(), is_utf8(), blocks_check(), convert_case(), lowercase(), uppercase(), unescape()
|
||
* * Validating method parameters to allowed types via reflection (You can disable it)
|
||
* * A single interface and encapsulation, You can inherit and override
|
||
* * Test coverage
|
||
* * PHP >= 5.3.x
|
||
*
|
||
* In Russian:
|
||
*
|
||
* Поддержка UTF-8 в PHP 5.
|
||
*
|
||
* Возможности и преимущества
|
||
* * Совместимость с интерфейсом стандартных PHP функций, работающих с однобайтовыми кодировками
|
||
* * Возможность работы без PHP расширений ICONV и MBSTRING, если они есть, то активно используются!
|
||
* Используется наиболее быстрый из доступных методов между MBSTRING, ICONV, родной реализацией на PHP и хаками.
|
||
* * Полезные функции, отсутствующие в ICONV и MBSTRING
|
||
* * Методы, которые принимают и возвращают строку, умеют принимать и возвращать null.
|
||
* Это удобно при выборках значений из базы данных.
|
||
* * Несколько методов умеют обрабатывать массивы рекурсивно:
|
||
* array_change_key_case(), convert_from(), convert_to(), strict(), is_utf8(), blocks_check(), convert_case(), lowercase(), uppercase(), unescape()
|
||
* * Проверка у методов входных параметров на допустимые типы через рефлексию (можно отключить)
|
||
* * Единый интерфейс и инкапсуляция, можно унаследоваться и переопределить методы
|
||
* * Покрытие тестами
|
||
* * PHP >= 5.3.x
|
||
*
|
||
* Example:
|
||
* $s = 'Hello, Привет';
|
||
* if (UTF8::is_utf8($s)) echo UTF8::strlen($s);
|
||
*
|
||
* UTF-8 encoding scheme:
|
||
* 2^7 0x00000000 — 0x0000007F 0xxxxxxx
|
||
* 2^11 0x00000080 — 0x000007FF 110xxxxx 10xxxxxx
|
||
* 2^16 0x00000800 — 0x0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
|
||
* 2^21 0x00010000 — 0x001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||
* 1-4 bytes length: 2^7 + 2^11 + 2^16 + 2^21 = 2 164 864
|
||
*
|
||
* If I were the owner of the world, I would keep only 2 encodings: UTF-8 and UTF-32 ;-)
|
||
*
|
||
* Useful links
|
||
* http://ru.wikipedia.org/wiki/UTF8
|
||
* http://www.madore.org/~david/misc/unitest/ A Unicode Test Page
|
||
* http://www.unicode.org/
|
||
* http://www.unicode.org/reports/
|
||
* http://www.unicode.org/reports/tr10/ Unicode Collation Algorithm
|
||
* http://www.unicode.org/Public/UCA/6.0.0/ Unicode Collation Algorithm
|
||
* http://www.unicode.org/reports/tr6/ A Standard Compression Scheme for Unicode
|
||
* http://www.fileformat.info/info/unicode/char/search.htm Unicode Character Search
|
||
*
|
||
* @link http://code.google.com/p/php5-utf8/
|
||
* @license http://creativecommons.org/licenses/by-sa/3.0/
|
||
* @author Nasibullin Rinat
|
||
* @version 2.3.1
|
||
*/
|
||
class UTF8
|
||
{
|
||
/**
|
||
* REPLACEMENT CHARACTER (for broken char)
|
||
*
|
||
* @var string
|
||
*/
|
||
const REPLACEMENT_CHAR = "\xEF\xBF\xBD"; #U+FFFD REPLACEMENT CHARACTER, encoded as UTF-8 bytes
|
||
|
||
/**
|
||
* Byte order mark, http://en.wikipedia.org/wiki/Byte_Order_Mark
|
||
*
|
||
* @var string
|
||
*/
|
||
const BOM = "\xEF\xBB\xBF"; #U+FEFF, encoded as UTF-8 bytes
|
||
|
||
/**
|
||
* Regular expression for a character in UTF-8.
|
||
* For engines, which don't support UTF8 mode.
|
||
* In PCRE use a dot (".") and the flag /u, it works much faster!
|
||
*
|
||
* @var string
|
||
*/
|
||
const CHAR_RE =
|
||
'[\x09\x0A\x0D\x20-\x7E] # ASCII strict
|
||
# [\x00-\x7F] # ASCII non-strict (including control chars)
|
||
| [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
|
||
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
|
||
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
|
||
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
|
||
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
|
||
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
|
||
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
|
||
';
|
||
|
||
/**
|
||
* Combining diacritical marks (Unicode 5.1).
|
||
* \p{M} in PCRE terms.
|
||
* For engines, which don't support UTF8 mode.
|
||
*
|
||
* For example, russian letters in composed form: "Ё" (U+0401), "Й" (U+0419),
|
||
* decomposed form: (U+0415 U+0308), (U+0418 U+0306)
|
||
*
|
||
* @link http://www.unicode.org/charts/PDF/U0300.pdf
|
||
* @link http://www.unicode.org/charts/PDF/U1DC0.pdf
|
||
* @link http://www.unicode.org/charts/PDF/UFE20.pdf
|
||
* @var string
|
||
*/
|
||
const DIACTRICAL_RE =
|
||
' \xcc[\x80-\xb9]|\xcd[\x80-\xaf] #UNICODE range: U+0300 — U+036F (for letters)
|
||
| \xe2\x83[\x90-\xbf] #UNICODE range: U+20D0 — U+20FF (for symbols)
|
||
| \xe1\xb7[\x80-\xbf] #UNICODE range: U+1DC0 — U+1DFF (supplement)
|
||
| \xef\xb8[\xa0-\xaf] #UNICODE range: U+FE20 — U+FE2F (combining half marks)
|
||
';
|
||
|
||
/**
|
||
* \p{Lu} in PCRE terms.
|
||
* For engines, which don't support UTF8 mode.
|
||
*
|
||
* @var string
|
||
*/
|
||
const CHAR_UPPER_RE = '[\x41-\x5a]
|
||
| \xc3[\x80-\x9e]
|
||
| \xc4[\x80-\xbf]
|
||
| \xc5[\x81-\xbd]
|
||
| \xc6[\x81-\xbc]
|
||
| \xc7[\x85-\xbe]
|
||
| \xc8[\x80-\xb2]
|
||
| \xce[\x86-\xab]
|
||
| \xcf[\x98-\xae]
|
||
| \xd0[\x80-\xaf]
|
||
| \xd1[\xa0-\xbe]
|
||
| \xd2[\x80-\xbe]
|
||
| \xd3[\x81-\xb8]
|
||
| \xd4[\x80-\xbf]
|
||
| \xd5[\x80-\x96]
|
||
| \xe1[\xb8\xb9\xba][\x80-\xbe]
|
||
| \xe1\xbb[\x80-\xb8]
|
||
| \xe1\xbc[\x88-\xbf]
|
||
| \xe1\xbd[\x88-\xaf]
|
||
| \xe1[\xbe\xbf][\x88-\xbc]
|
||
| \xef\xbc[\xa1-\xba]
|
||
';
|
||
|
||
/**
|
||
* \p{Ll} in PCRE terms.
|
||
* For engines, which don't support UTF8 mode.
|
||
*
|
||
* @var string
|
||
*/
|
||
const CHAR_LOWER_RE = '[\x61-\x7a]
|
||
| \xc2\xb5
|
||
| \xc3[\xa0-\xbf]
|
||
| \xc4[\x81-\xbe]
|
||
| \xc5[\x80-\xbe]
|
||
| \xc6[\x83-\xbf]
|
||
| \xc7[\x86-\xbf]
|
||
| \xc8[\x81-\xb3]
|
||
| \xc9[\x93-\xb5]
|
||
| \xca[\x80-\x92]
|
||
| \xce[\xac-\xbf]
|
||
| \xcf[\x80-\xaf]
|
||
| \xd0[\xb0-\xbf]
|
||
| \xd1[\x80-\xbf]
|
||
| \xd2[\x81-\xbf]
|
||
| \xd3[\x82-\xb9]
|
||
| \xd4[\x81-\x8f]
|
||
| \xd5[\xa1-\xbf]
|
||
| \xd6[\x80-\x86]
|
||
| \xe1[\xb8\xb9\xba][\x81-\xbf]
|
||
| \xe1\xbb[\x81-\xb9]
|
||
| \xe1\xbc[\x80-\xb7]
|
||
| \xe1\xbd[\x80-\xbd]
|
||
| \xe1\xbe[\x80-\xb3]
|
||
| \xe1\xbf[\x83-\xb3]
|
||
| \xef\xbd[\x81-\x9a]
|
||
';
|
||
|
||
/**
|
||
* HTML entities, examples: &gt; &Ouml; &tilde; &quot;
|
||
*
|
||
* @var string
|
||
*/
|
||
const HTML_ENTITY_RE = '&(?> [a-zA-Z][a-zA-Z\d]++
|
||
| \#(?> \d{1,4}+
|
||
| x[\da-fA-F]{2,4}+
|
||
)
|
||
);
|
||
';
|
||
|
||
/**
|
||
* Quotation marks.
|
||
* For engines, which don't support UTF8 mode.
|
||
*
|
||
* @var string
|
||
*/
|
||
const QUOTATION_MARK_RE = '\x22|\xc2[\xab\xbb]|\xe2\x80[\x98\x99\x9a\x9c\x9d\x9e\xb9\xba]'; #U+0022, U+00AB, U+00BB, U+2018, U+2019, U+201A, U+201C, U+201D, U+201E, U+2039, U+203A
|
||
|
||
/**
|
||
*
|
||
* @var array
|
||
*/
|
||
public static $html_quotation_mark_table = array(
    #Maps the HTML entity name of each quotation mark to the UTF-8 bytes of the character it denotes.
    #Keys must be the entity names (not the decoded characters), otherwise the table maps every character to itself.
    '&quot;'   => "\x22",         #U+0022 ["] quotation mark = APL quote
    '&laquo;'  => "\xc2\xab",     #U+00AB [«] left-pointing double angle quotation mark = left pointing guillemet
    '&raquo;'  => "\xc2\xbb",     #U+00BB [»] right-pointing double angle quotation mark = right pointing guillemet
    '&lsquo;'  => "\xe2\x80\x98", #U+2018 [‘] left single quotation mark
    '&rsquo;'  => "\xe2\x80\x99", #U+2019 [’] right single quotation mark (and apostrophe!)
    '&sbquo;'  => "\xe2\x80\x9a", #U+201A [‚] single low-9 quotation mark
    '&ldquo;'  => "\xe2\x80\x9c", #U+201C [“] left double quotation mark
    '&rdquo;'  => "\xe2\x80\x9d", #U+201D [”] right double quotation mark
    '&bdquo;'  => "\xe2\x80\x9e", #U+201E [„] double low-9 quotation mark
    '&lsaquo;' => "\xe2\x80\xb9", #U+2039 [‹] single left-pointing angle quotation mark
    '&rsaquo;' => "\xe2\x80\xba", #U+203A [›] single right-pointing angle quotation mark
);
|
||
|
||
/**
|
||
* HTML special chars table
|
||
*
|
||
* @var array
|
||
*/
|
||
public static $html_special_chars_table = array(
    #Maps the entity name of each HTML special character to the character itself.
    #Keys must be the entity names (not the decoded characters), otherwise the table maps every character to itself.
    '&quot;' => "\x22", #U+0022 ["] quotation mark = APL quote
    '&amp;'  => "\x26", #U+0026 [&] ampersand
    '&lt;'   => "\x3c", #U+003C [<] less-than sign
    '&gt;'   => "\x3e", #U+003E [>] greater-than sign
    #The &apos; entity is only available in XHTML/HTML5 and not in plain HTML, see http://www.w3.org/TR/xhtml1/#C_16
    #'&apos;' => "\x27", #U+0027 ['] apostrophe
);
|
||
|
||
/**
|
||
* @link http://www.fileformat.info/format/w3c/entitytest.htm?sort=Unicode%20Character HTML Entity Browser Test Page
|
||
* @var array
|
||
*/
|
||
public static $html_entity_table = array(
    #Maps HTML named character entities to the UTF-8 bytes of the character they denote.
    #Keys must be the entity names (not the decoded characters), otherwise the table maps every character to itself.

    #Latin-1 Entities:
    '&nbsp;'   => "\xc2\xa0", #U+00A0 no-break space = non-breaking space
    '&iexcl;'  => "\xc2\xa1", #U+00A1 [¡] inverted exclamation mark
    '&cent;'   => "\xc2\xa2", #U+00A2 [¢] cent sign
    '&pound;'  => "\xc2\xa3", #U+00A3 [£] pound sign
    '&curren;' => "\xc2\xa4", #U+00A4 [¤] currency sign
    '&yen;'    => "\xc2\xa5", #U+00A5 [¥] yen sign = yuan sign
    '&brvbar;' => "\xc2\xa6", #U+00A6 [¦] broken bar = broken vertical bar
    '&sect;'   => "\xc2\xa7", #U+00A7 [§] section sign
    '&uml;'    => "\xc2\xa8", #U+00A8 [¨] diaeresis = spacing diaeresis
    '&copy;'   => "\xc2\xa9", #U+00A9 [©] copyright sign
    '&ordf;'   => "\xc2\xaa", #U+00AA [ª] feminine ordinal indicator
    '&laquo;'  => "\xc2\xab", #U+00AB [«] left-pointing double angle quotation mark = left pointing guillemet
    '&not;'    => "\xc2\xac", #U+00AC [¬] not sign
    '&shy;'    => "\xc2\xad", #U+00AD soft hyphen = discretionary hyphen
    '&reg;'    => "\xc2\xae", #U+00AE [®] registered sign = registered trade mark sign
    '&macr;'   => "\xc2\xaf", #U+00AF [¯] macron = spacing macron = overline = APL overbar
    '&deg;'    => "\xc2\xb0", #U+00B0 [°] degree sign
    '&plusmn;' => "\xc2\xb1", #U+00B1 [±] plus-minus sign = plus-or-minus sign
    '&sup2;'   => "\xc2\xb2", #U+00B2 [²] superscript two = superscript digit two = squared
    '&sup3;'   => "\xc2\xb3", #U+00B3 [³] superscript three = superscript digit three = cubed
    '&acute;'  => "\xc2\xb4", #U+00B4 [´] acute accent = spacing acute
    '&micro;'  => "\xc2\xb5", #U+00B5 [µ] micro sign
    '&para;'   => "\xc2\xb6", #U+00B6 [¶] pilcrow sign = paragraph sign
    '&middot;' => "\xc2\xb7", #U+00B7 [·] middle dot = Georgian comma = Greek middle dot
    '&cedil;'  => "\xc2\xb8", #U+00B8 [¸] cedilla = spacing cedilla
    '&sup1;'   => "\xc2\xb9", #U+00B9 [¹] superscript one = superscript digit one
    '&ordm;'   => "\xc2\xba", #U+00BA [º] masculine ordinal indicator
    '&raquo;'  => "\xc2\xbb", #U+00BB [»] right-pointing double angle quotation mark = right pointing guillemet
    '&frac14;' => "\xc2\xbc", #U+00BC [¼] vulgar fraction one quarter = fraction one quarter
    '&frac12;' => "\xc2\xbd", #U+00BD [½] vulgar fraction one half = fraction one half
    '&frac34;' => "\xc2\xbe", #U+00BE [¾] vulgar fraction three quarters = fraction three quarters
    '&iquest;' => "\xc2\xbf", #U+00BF [¿] inverted question mark = turned question mark
    #Latin capital letter
    '&Agrave;' => "\xc3\x80", #Latin capital letter A with grave = Latin capital letter A grave
    '&Aacute;' => "\xc3\x81", #Latin capital letter A with acute
    '&Acirc;'  => "\xc3\x82", #Latin capital letter A with circumflex
    '&Atilde;' => "\xc3\x83", #Latin capital letter A with tilde
    '&Auml;'   => "\xc3\x84", #Latin capital letter A with diaeresis
    '&Aring;'  => "\xc3\x85", #Latin capital letter A with ring above = Latin capital letter A ring
    '&AElig;'  => "\xc3\x86", #Latin capital letter AE = Latin capital ligature AE
    '&Ccedil;' => "\xc3\x87", #Latin capital letter C with cedilla
    '&Egrave;' => "\xc3\x88", #Latin capital letter E with grave
    '&Eacute;' => "\xc3\x89", #Latin capital letter E with acute
    '&Ecirc;'  => "\xc3\x8a", #Latin capital letter E with circumflex
    '&Euml;'   => "\xc3\x8b", #Latin capital letter E with diaeresis
    '&Igrave;' => "\xc3\x8c", #Latin capital letter I with grave
    '&Iacute;' => "\xc3\x8d", #Latin capital letter I with acute
    '&Icirc;'  => "\xc3\x8e", #Latin capital letter I with circumflex
    '&Iuml;'   => "\xc3\x8f", #Latin capital letter I with diaeresis
    '&ETH;'    => "\xc3\x90", #Latin capital letter ETH
    '&Ntilde;' => "\xc3\x91", #Latin capital letter N with tilde
    '&Ograve;' => "\xc3\x92", #Latin capital letter O with grave
    '&Oacute;' => "\xc3\x93", #Latin capital letter O with acute
    '&Ocirc;'  => "\xc3\x94", #Latin capital letter O with circumflex
    '&Otilde;' => "\xc3\x95", #Latin capital letter O with tilde
    '&Ouml;'   => "\xc3\x96", #Latin capital letter O with diaeresis
    '&times;'  => "\xc3\x97", #U+00D7 [×] multiplication sign
    '&Oslash;' => "\xc3\x98", #Latin capital letter O with stroke = Latin capital letter O slash
    '&Ugrave;' => "\xc3\x99", #Latin capital letter U with grave
    '&Uacute;' => "\xc3\x9a", #Latin capital letter U with acute
    '&Ucirc;'  => "\xc3\x9b", #Latin capital letter U with circumflex
    '&Uuml;'   => "\xc3\x9c", #Latin capital letter U with diaeresis
    '&Yacute;' => "\xc3\x9d", #Latin capital letter Y with acute
    '&THORN;'  => "\xc3\x9e", #Latin capital letter THORN
    #Latin small letter
    '&szlig;'  => "\xc3\x9f", #Latin small letter sharp s = ess-zed
    '&agrave;' => "\xc3\xa0", #Latin small letter a with grave = Latin small letter a grave
    '&aacute;' => "\xc3\xa1", #Latin small letter a with acute
    '&acirc;'  => "\xc3\xa2", #Latin small letter a with circumflex
    '&atilde;' => "\xc3\xa3", #Latin small letter a with tilde
    '&auml;'   => "\xc3\xa4", #Latin small letter a with diaeresis
    '&aring;'  => "\xc3\xa5", #Latin small letter a with ring above = Latin small letter a ring
    '&aelig;'  => "\xc3\xa6", #Latin small letter ae = Latin small ligature ae
    '&ccedil;' => "\xc3\xa7", #Latin small letter c with cedilla
    '&egrave;' => "\xc3\xa8", #Latin small letter e with grave
    '&eacute;' => "\xc3\xa9", #Latin small letter e with acute
    '&ecirc;'  => "\xc3\xaa", #Latin small letter e with circumflex
    '&euml;'   => "\xc3\xab", #Latin small letter e with diaeresis
    '&igrave;' => "\xc3\xac", #Latin small letter i with grave
    '&iacute;' => "\xc3\xad", #Latin small letter i with acute
    '&icirc;'  => "\xc3\xae", #Latin small letter i with circumflex
    '&iuml;'   => "\xc3\xaf", #Latin small letter i with diaeresis
    '&eth;'    => "\xc3\xb0", #Latin small letter eth
    '&ntilde;' => "\xc3\xb1", #Latin small letter n with tilde
    '&ograve;' => "\xc3\xb2", #Latin small letter o with grave
    '&oacute;' => "\xc3\xb3", #Latin small letter o with acute
    '&ocirc;'  => "\xc3\xb4", #Latin small letter o with circumflex
    '&otilde;' => "\xc3\xb5", #Latin small letter o with tilde
    '&ouml;'   => "\xc3\xb6", #Latin small letter o with diaeresis
    '&divide;' => "\xc3\xb7", #U+00F7 [÷] division sign
    '&oslash;' => "\xc3\xb8", #Latin small letter o with stroke = Latin small letter o slash
    '&ugrave;' => "\xc3\xb9", #Latin small letter u with grave
    '&uacute;' => "\xc3\xba", #Latin small letter u with acute
    '&ucirc;'  => "\xc3\xbb", #Latin small letter u with circumflex
    '&uuml;'   => "\xc3\xbc", #Latin small letter u with diaeresis
    '&yacute;' => "\xc3\xbd", #Latin small letter y with acute
    '&thorn;'  => "\xc3\xbe", #Latin small letter thorn
    '&yuml;'   => "\xc3\xbf", #Latin small letter y with diaeresis
    #Symbols and Greek Letters:
    '&fnof;'    => "\xc6\x92", #U+0192 [ƒ] Latin small f with hook = function = florin
    '&Alpha;'   => "\xce\x91", #Greek capital letter alpha
    '&Beta;'    => "\xce\x92", #Greek capital letter beta
    '&Gamma;'   => "\xce\x93", #Greek capital letter gamma
    '&Delta;'   => "\xce\x94", #Greek capital letter delta
    '&Epsilon;' => "\xce\x95", #Greek capital letter epsilon
    '&Zeta;'    => "\xce\x96", #Greek capital letter zeta
    '&Eta;'     => "\xce\x97", #Greek capital letter eta
    '&Theta;'   => "\xce\x98", #Greek capital letter theta
    '&Iota;'    => "\xce\x99", #Greek capital letter iota
    '&Kappa;'   => "\xce\x9a", #Greek capital letter kappa
    '&Lambda;'  => "\xce\x9b", #Greek capital letter lambda
    '&Mu;'      => "\xce\x9c", #Greek capital letter mu
    '&Nu;'      => "\xce\x9d", #Greek capital letter nu
    '&Xi;'      => "\xce\x9e", #Greek capital letter xi
    '&Omicron;' => "\xce\x9f", #Greek capital letter omicron
    '&Pi;'      => "\xce\xa0", #Greek capital letter pi
    '&Rho;'     => "\xce\xa1", #Greek capital letter rho
    '&Sigma;'   => "\xce\xa3", #Greek capital letter sigma
    '&Tau;'     => "\xce\xa4", #Greek capital letter tau
    '&Upsilon;' => "\xce\xa5", #Greek capital letter upsilon
    '&Phi;'     => "\xce\xa6", #Greek capital letter phi
    '&Chi;'     => "\xce\xa7", #Greek capital letter chi
    '&Psi;'     => "\xce\xa8", #Greek capital letter psi
    '&Omega;'   => "\xce\xa9", #Greek capital letter omega
    '&alpha;'   => "\xce\xb1", #Greek small letter alpha
    '&beta;'    => "\xce\xb2", #Greek small letter beta
    '&gamma;'   => "\xce\xb3", #Greek small letter gamma
    '&delta;'   => "\xce\xb4", #Greek small letter delta
    '&epsilon;' => "\xce\xb5", #Greek small letter epsilon
    '&zeta;'    => "\xce\xb6", #Greek small letter zeta
    '&eta;'     => "\xce\xb7", #Greek small letter eta
    '&theta;'   => "\xce\xb8", #Greek small letter theta
    '&iota;'    => "\xce\xb9", #Greek small letter iota
    '&kappa;'   => "\xce\xba", #Greek small letter kappa
    '&lambda;'  => "\xce\xbb", #Greek small letter lambda
    '&mu;'      => "\xce\xbc", #Greek small letter mu
    '&nu;'      => "\xce\xbd", #Greek small letter nu
    '&xi;'      => "\xce\xbe", #Greek small letter xi
    '&omicron;' => "\xce\xbf", #Greek small letter omicron
    '&pi;'      => "\xcf\x80", #Greek small letter pi
    '&rho;'     => "\xcf\x81", #Greek small letter rho
    '&sigmaf;'  => "\xcf\x82", #Greek small letter final sigma
    '&sigma;'   => "\xcf\x83", #Greek small letter sigma
    '&tau;'     => "\xcf\x84", #Greek small letter tau
    '&upsilon;' => "\xcf\x85", #Greek small letter upsilon
    '&phi;'     => "\xcf\x86", #Greek small letter phi
    '&chi;'     => "\xcf\x87", #Greek small letter chi
    '&psi;'     => "\xcf\x88", #Greek small letter psi
    '&omega;'   => "\xcf\x89", #Greek small letter omega
    '&thetasym;'=> "\xcf\x91", #Greek small letter theta symbol
    '&upsih;'   => "\xcf\x92", #Greek upsilon with hook symbol
    '&piv;'     => "\xcf\x96", #U+03D6 [ϖ] Greek pi symbol

    '&bull;'    => "\xe2\x80\xa2", #U+2022 [•] bullet = black small circle
    '&hellip;'  => "\xe2\x80\xa6", #U+2026 […] horizontal ellipsis = three dot leader
    '&prime;'   => "\xe2\x80\xb2", #U+2032 [′] prime = minutes = feet (used to denote minutes and feet)
    '&Prime;'   => "\xe2\x80\xb3", #U+2033 [″] double prime = seconds = inches (used to denote seconds and inches)
    '&oline;'   => "\xe2\x80\xbe", #U+203E [‾] overline = spacing overscore
    '&frasl;'   => "\xe2\x81\x84", #U+2044 [⁄] fraction slash
    '&weierp;'  => "\xe2\x84\x98", #U+2118 [℘] script capital P = power set = Weierstrass p
    '&image;'   => "\xe2\x84\x91", #U+2111 [ℑ] blackletter capital I = imaginary part
    '&real;'    => "\xe2\x84\x9c", #U+211C [ℜ] blackletter capital R = real part symbol
    '&trade;'   => "\xe2\x84\xa2", #U+2122 [™] trade mark sign
    '&alefsym;' => "\xe2\x84\xb5", #U+2135 [ℵ] alef symbol = first transfinite cardinal
    '&larr;'    => "\xe2\x86\x90", #U+2190 [←] leftwards arrow
    '&uarr;'    => "\xe2\x86\x91", #U+2191 [↑] upwards arrow
    '&rarr;'    => "\xe2\x86\x92", #U+2192 [→] rightwards arrow
    '&darr;'    => "\xe2\x86\x93", #U+2193 [↓] downwards arrow
    '&harr;'    => "\xe2\x86\x94", #U+2194 [↔] left right arrow
    '&crarr;'   => "\xe2\x86\xb5", #U+21B5 [↵] downwards arrow with corner leftwards = carriage return
    '&lArr;'    => "\xe2\x87\x90", #U+21D0 [⇐] leftwards double arrow
    '&uArr;'    => "\xe2\x87\x91", #U+21D1 [⇑] upwards double arrow
    '&rArr;'    => "\xe2\x87\x92", #U+21D2 [⇒] rightwards double arrow
    '&dArr;'    => "\xe2\x87\x93", #U+21D3 [⇓] downwards double arrow
    '&hArr;'    => "\xe2\x87\x94", #U+21D4 [⇔] left right double arrow
    '&forall;'  => "\xe2\x88\x80", #U+2200 [∀] for all
    '&part;'    => "\xe2\x88\x82", #U+2202 [∂] partial differential
    '&exist;'   => "\xe2\x88\x83", #U+2203 [∃] there exists
    '&empty;'   => "\xe2\x88\x85", #U+2205 [∅] empty set = null set = diameter
    '&nabla;'   => "\xe2\x88\x87", #U+2207 [∇] nabla = backward difference
    '&isin;'    => "\xe2\x88\x88", #U+2208 [∈] element of
    '&notin;'   => "\xe2\x88\x89", #U+2209 [∉] not an element of
    '&ni;'      => "\xe2\x88\x8b", #U+220B [∋] contains as member
    '&prod;'    => "\xe2\x88\x8f", #U+220F [∏] n-ary product = product sign
    '&sum;'     => "\xe2\x88\x91", #U+2211 [∑] n-ary summation
    '&minus;'   => "\xe2\x88\x92", #U+2212 [−] minus sign
    '&lowast;'  => "\xe2\x88\x97", #U+2217 [∗] asterisk operator
    '&radic;'   => "\xe2\x88\x9a", #U+221A [√] square root = radical sign
    '&prop;'    => "\xe2\x88\x9d", #U+221D [∝] proportional to
    '&infin;'   => "\xe2\x88\x9e", #U+221E [∞] infinity
    '&ang;'     => "\xe2\x88\xa0", #U+2220 [∠] angle
    '&and;'     => "\xe2\x88\xa7", #U+2227 [∧] logical and = wedge
    '&or;'      => "\xe2\x88\xa8", #U+2228 [∨] logical or = vee
    '&cap;'     => "\xe2\x88\xa9", #U+2229 [∩] intersection = cap
    '&cup;'     => "\xe2\x88\xaa", #U+222A [∪] union = cup
    '&int;'     => "\xe2\x88\xab", #U+222B [∫] integral
    '&there4;'  => "\xe2\x88\xb4", #U+2234 [∴] therefore
    '&sim;'     => "\xe2\x88\xbc", #U+223C [∼] tilde operator = varies with = similar to
    '&cong;'    => "\xe2\x89\x85", #U+2245 [≅] approximately equal to
    '&asymp;'   => "\xe2\x89\x88", #U+2248 [≈] almost equal to = asymptotic to
    '&ne;'      => "\xe2\x89\xa0", #U+2260 [≠] not equal to
    '&equiv;'   => "\xe2\x89\xa1", #U+2261 [≡] identical to
    '&le;'      => "\xe2\x89\xa4", #U+2264 [≤] less-than or equal to
    '&ge;'      => "\xe2\x89\xa5", #U+2265 [≥] greater-than or equal to
    '&sub;'     => "\xe2\x8a\x82", #U+2282 [⊂] subset of
    '&sup;'     => "\xe2\x8a\x83", #U+2283 [⊃] superset of
    '&nsub;'    => "\xe2\x8a\x84", #U+2284 [⊄] not a subset of
    '&sube;'    => "\xe2\x8a\x86", #U+2286 [⊆] subset of or equal to
    '&supe;'    => "\xe2\x8a\x87", #U+2287 [⊇] superset of or equal to
    '&oplus;'   => "\xe2\x8a\x95", #U+2295 [⊕] circled plus = direct sum
    '&otimes;'  => "\xe2\x8a\x97", #U+2297 [⊗] circled times = vector product
    '&perp;'    => "\xe2\x8a\xa5", #U+22A5 [⊥] up tack = orthogonal to = perpendicular
    '&sdot;'    => "\xe2\x8b\x85", #U+22C5 [⋅] dot operator
    '&lceil;'   => "\xe2\x8c\x88", #U+2308 [⌈] left ceiling = APL upstile
    '&rceil;'   => "\xe2\x8c\x89", #U+2309 [⌉] right ceiling
    '&lfloor;'  => "\xe2\x8c\x8a", #U+230A [⌊] left floor = APL downstile
    '&rfloor;'  => "\xe2\x8c\x8b", #U+230B [⌋] right floor
    '&lang;'    => "\xe2\x8c\xa9", #U+2329 [〈] left-pointing angle bracket = bra
    '&rang;'    => "\xe2\x8c\xaa", #U+232A [〉] right-pointing angle bracket = ket
    '&loz;'     => "\xe2\x97\x8a", #U+25CA [◊] lozenge
    '&spades;'  => "\xe2\x99\xa0", #U+2660 [♠] black spade suit
    '&clubs;'   => "\xe2\x99\xa3", #U+2663 [♣] black club suit = shamrock
    '&hearts;'  => "\xe2\x99\xa5", #U+2665 [♥] black heart suit = valentine
    '&diams;'   => "\xe2\x99\xa6", #U+2666 [♦] black diamond suit
    #Other Special Characters:
    '&OElig;'  => "\xc5\x92", #U+0152 [Œ] Latin capital ligature OE
    '&oelig;'  => "\xc5\x93", #U+0153 [œ] Latin small ligature oe
    '&Scaron;' => "\xc5\xa0", #U+0160 [Š] Latin capital letter S with caron
    '&scaron;' => "\xc5\xa1", #U+0161 [š] Latin small letter s with caron
    '&Yuml;'   => "\xc5\xb8", #U+0178 [Ÿ] Latin capital letter Y with diaeresis
    '&circ;'   => "\xcb\x86", #U+02C6 [ˆ] modifier letter circumflex accent
    '&tilde;'  => "\xcb\x9c", #U+02DC [˜] small tilde
    '&ensp;'   => "\xe2\x80\x82", #U+2002 en space
    '&emsp;'   => "\xe2\x80\x83", #U+2003 em space
    '&thinsp;' => "\xe2\x80\x89", #U+2009 thin space
    '&zwnj;'   => "\xe2\x80\x8c", #U+200C zero width non-joiner
    '&zwj;'    => "\xe2\x80\x8d", #U+200D zero width joiner
    '&lrm;'    => "\xe2\x80\x8e", #U+200E left-to-right mark
    '&rlm;'    => "\xe2\x80\x8f", #U+200F right-to-left mark
    '&ndash;'  => "\xe2\x80\x93", #U+2013 [–] en dash
    '&mdash;'  => "\xe2\x80\x94", #U+2014 [—] em dash
    '&lsquo;'  => "\xe2\x80\x98", #U+2018 [‘] left single quotation mark
    '&rsquo;'  => "\xe2\x80\x99", #U+2019 [’] right single quotation mark (and apostrophe!)
    '&sbquo;'  => "\xe2\x80\x9a", #U+201A [‚] single low-9 quotation mark
    '&ldquo;'  => "\xe2\x80\x9c", #U+201C [“] left double quotation mark
    '&rdquo;'  => "\xe2\x80\x9d", #U+201D [”] right double quotation mark
    '&bdquo;'  => "\xe2\x80\x9e", #U+201E [„] double low-9 quotation mark
    '&dagger;' => "\xe2\x80\xa0", #U+2020 [†] dagger
    '&Dagger;' => "\xe2\x80\xa1", #U+2021 [‡] double dagger
    '&permil;' => "\xe2\x80\xb0", #U+2030 [‰] per mille sign
    '&lsaquo;' => "\xe2\x80\xb9", #U+2039 [‹] single left-pointing angle quotation mark
    '&rsaquo;' => "\xe2\x80\xba", #U+203A [›] single right-pointing angle quotation mark
    '&euro;'   => "\xe2\x82\xac", #U+20AC [€] euro sign
);
|
||
|
||
/**
|
||
* This table contains the data on how cp1259 characters map into Unicode (UTF-8).
|
||
* The cp1259 map describes the standard Tatar Cyrillic charset and is based on the cp1251 table.
|
||
* cp1259 -- this is an outdated one byte encoding of the Tatar language,
|
||
* which includes all the Russian letters from cp1251.
|
||
*
|
||
* @link http://search.cpan.org/CPAN/authors/id/A/AM/AMICHAUER/Lingua-TT-Yanalif-0.08.tar.gz
|
||
* @link http://www.unicode.org/charts/PDF/U0400.pdf
|
||
* @var array
|
||
*/
|
||
public static $cp1259_table = array(
|
||
#bytes from 0x00 to 0x7F (ASCII) saved as is
|
||
"\x80" => "\xd3\x98", #U+04d8 CYRILLIC CAPITAL LETTER SCHWA
|
||
"\x81" => "\xd0\x83", #U+0403 CYRILLIC CAPITAL LETTER GJE
|
||
"\x82" => "\xe2\x80\x9a", #U+201a SINGLE LOW-9 QUOTATION MARK
|
||
"\x83" => "\xd1\x93", #U+0453 CYRILLIC SMALL LETTER GJE
|
||
"\x84" => "\xe2\x80\x9e", #U+201e DOUBLE LOW-9 QUOTATION MARK
|
||
"\x85" => "\xe2\x80\xa6", #U+2026 HORIZONTAL ELLIPSIS
|
||
"\x86" => "\xe2\x80\xa0", #U+2020 DAGGER
|
||
"\x87" => "\xe2\x80\xa1", #U+2021 DOUBLE DAGGER
|
||
"\x88" => "\xe2\x82\xac", #U+20ac EURO SIGN
|
||
"\x89" => "\xe2\x80\xb0", #U+2030 PER MILLE SIGN
|
||
"\x8a" => "\xd3\xa8", #U+04e8 CYRILLIC CAPITAL LETTER BARRED O
|
||
"\x8b" => "\xe2\x80\xb9", #U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||
"\x8c" => "\xd2\xae", #U+04ae CYRILLIC CAPITAL LETTER STRAIGHT U
|
||
"\x8d" => "\xd2\x96", #U+0496 CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER
|
||
"\x8e" => "\xd2\xa2", #U+04a2 CYRILLIC CAPITAL LETTER EN WITH HOOK
|
||
"\x8f" => "\xd2\xba", #U+04ba CYRILLIC CAPITAL LETTER SHHA
|
||
"\x90" => "\xd3\x99", #U+04d9 CYRILLIC SMALL LETTER SCHWA
|
||
"\x91" => "\xe2\x80\x98", #U+2018 LEFT SINGLE QUOTATION MARK
|
||
"\x92" => "\xe2\x80\x99", #U+2019 RIGHT SINGLE QUOTATION MARK
|
||
"\x93" => "\xe2\x80\x9c", #U+201c LEFT DOUBLE QUOTATION MARK
|
||
"\x94" => "\xe2\x80\x9d", #U+201d RIGHT DOUBLE QUOTATION MARK
|
||
"\x95" => "\xe2\x80\xa2", #U+2022 BULLET
|
||
"\x96" => "\xe2\x80\x93", #U+2013 EN DASH
|
||
"\x97" => "\xe2\x80\x94", #U+2014 EM DASH
|
||
#"\x98" #UNDEFINED
|
||
"\x99" => "\xe2\x84\xa2", #U+2122 TRADE MARK SIGN
|
||
"\x9a" => "\xd3\xa9", #U+04e9 CYRILLIC SMALL LETTER BARRED O
|
||
"\x9b" => "\xe2\x80\xba", #U+203a SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||
"\x9c" => "\xd2\xaf", #U+04af CYRILLIC SMALL LETTER STRAIGHT U
|
||
"\x9d" => "\xd2\x97", #U+0497 CYRILLIC SMALL LETTER ZHE WITH DESCENDER
|
||
"\x9e" => "\xd2\xa3", #U+04a3 CYRILLIC SMALL LETTER EN WITH HOOK
|
||
"\x9f" => "\xd2\xbb", #U+04bb CYRILLIC SMALL LETTER SHHA
|
||
"\xa0" => "\xc2\xa0", #U+00a0 NO-BREAK SPACE
|
||
"\xa1" => "\xd0\x8e", #U+040e CYRILLIC CAPITAL LETTER SHORT U
|
||
"\xa2" => "\xd1\x9e", #U+045e CYRILLIC SMALL LETTER SHORT U
|
||
"\xa3" => "\xd0\x88", #U+0408 CYRILLIC CAPITAL LETTER JE
|
||
"\xa4" => "\xc2\xa4", #U+00a4 CURRENCY SIGN
|
||
"\xa5" => "\xd2\x90", #U+0490 CYRILLIC CAPITAL LETTER GHE WITH UPTURN
|
||
"\xa6" => "\xc2\xa6", #U+00a6 BROKEN BAR
|
||
"\xa7" => "\xc2\xa7", #U+00a7 SECTION SIGN
|
||
"\xa8" => "\xd0\x81", #U+0401 CYRILLIC CAPITAL LETTER IO
|
||
"\xa9" => "\xc2\xa9", #U+00a9 COPYRIGHT SIGN
|
||
"\xaa" => "\xd0\x84", #U+0404 CYRILLIC CAPITAL LETTER UKRAINIAN IE
|
||
"\xab" => "\xc2\xab", #U+00ab LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||
"\xac" => "\xc2\xac", #U+00ac NOT SIGN
|
||
"\xad" => "\xc2\xad", #U+00ad SOFT HYPHEN
|
||
"\xae" => "\xc2\xae", #U+00ae REGISTERED SIGN
|
||
"\xaf" => "\xd0\x87", #U+0407 CYRILLIC CAPITAL LETTER YI
|
||
"\xb0" => "\xc2\xb0", #U+00b0 DEGREE SIGN
|
||
"\xb1" => "\xc2\xb1", #U+00b1 PLUS-MINUS SIGN
|
||
"\xb2" => "\xd0\x86", #U+0406 CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I
|
||
"\xb3" => "\xd1\x96", #U+0456 CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
|
||
"\xb4" => "\xd2\x91", #U+0491 CYRILLIC SMALL LETTER GHE WITH UPTURN
|
||
"\xb5" => "\xc2\xb5", #U+00b5 MICRO SIGN
|
||
"\xb6" => "\xc2\xb6", #U+00b6 PILCROW SIGN
|
||
"\xb7" => "\xc2\xb7", #U+00b7 MIDDLE DOT
|
||
"\xb8" => "\xd1\x91", #U+0451 CYRILLIC SMALL LETTER IO
|
||
"\xb9" => "\xe2\x84\x96", #U+2116 NUMERO SIGN
|
||
"\xba" => "\xd1\x94", #U+0454 CYRILLIC SMALL LETTER UKRAINIAN IE
|
||
"\xbb" => "\xc2\xbb", #U+00bb RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||
"\xbc" => "\xd1\x98", #U+0458 CYRILLIC SMALL LETTER JE
|
||
"\xbd" => "\xd0\x85", #U+0405 CYRILLIC CAPITAL LETTER DZE
|
||
"\xbe" => "\xd1\x95", #U+0455 CYRILLIC SMALL LETTER DZE
|
||
"\xbf" => "\xd1\x97", #U+0457 CYRILLIC SMALL LETTER YI
|
||
"\xc0" => "\xd0\x90", #U+0410 CYRILLIC CAPITAL LETTER A
|
||
"\xc1" => "\xd0\x91", #U+0411 CYRILLIC CAPITAL LETTER BE
|
||
"\xc2" => "\xd0\x92", #U+0412 CYRILLIC CAPITAL LETTER VE
|
||
"\xc3" => "\xd0\x93", #U+0413 CYRILLIC CAPITAL LETTER GHE
|
||
"\xc4" => "\xd0\x94", #U+0414 CYRILLIC CAPITAL LETTER DE
|
||
"\xc5" => "\xd0\x95", #U+0415 CYRILLIC CAPITAL LETTER IE
|
||
"\xc6" => "\xd0\x96", #U+0416 CYRILLIC CAPITAL LETTER ZHE
|
||
"\xc7" => "\xd0\x97", #U+0417 CYRILLIC CAPITAL LETTER ZE
|
||
"\xc8" => "\xd0\x98", #U+0418 CYRILLIC CAPITAL LETTER I
|
||
"\xc9" => "\xd0\x99", #U+0419 CYRILLIC CAPITAL LETTER SHORT I
|
||
"\xca" => "\xd0\x9a", #U+041a CYRILLIC CAPITAL LETTER KA
|
||
"\xcb" => "\xd0\x9b", #U+041b CYRILLIC CAPITAL LETTER EL
|
||
"\xcc" => "\xd0\x9c", #U+041c CYRILLIC CAPITAL LETTER EM
|
||
"\xcd" => "\xd0\x9d", #U+041d CYRILLIC CAPITAL LETTER EN
|
||
"\xce" => "\xd0\x9e", #U+041e CYRILLIC CAPITAL LETTER O
|
||
"\xcf" => "\xd0\x9f", #U+041f CYRILLIC CAPITAL LETTER PE
|
||
"\xd0" => "\xd0\xa0", #U+0420 CYRILLIC CAPITAL LETTER ER
|
||
"\xd1" => "\xd0\xa1", #U+0421 CYRILLIC CAPITAL LETTER ES
|
||
"\xd2" => "\xd0\xa2", #U+0422 CYRILLIC CAPITAL LETTER TE
|
||
"\xd3" => "\xd0\xa3", #U+0423 CYRILLIC CAPITAL LETTER U
|
||
"\xd4" => "\xd0\xa4", #U+0424 CYRILLIC CAPITAL LETTER EF
|
||
"\xd5" => "\xd0\xa5", #U+0425 CYRILLIC CAPITAL LETTER HA
|
||
"\xd6" => "\xd0\xa6", #U+0426 CYRILLIC CAPITAL LETTER TSE
|
||
"\xd7" => "\xd0\xa7", #U+0427 CYRILLIC CAPITAL LETTER CHE
|
||
"\xd8" => "\xd0\xa8", #U+0428 CYRILLIC CAPITAL LETTER SHA
|
||
"\xd9" => "\xd0\xa9", #U+0429 CYRILLIC CAPITAL LETTER SHCHA
|
||
"\xda" => "\xd0\xaa", #U+042a CYRILLIC CAPITAL LETTER HARD SIGN
|
||
"\xdb" => "\xd0\xab", #U+042b CYRILLIC CAPITAL LETTER YERU
|
||
"\xdc" => "\xd0\xac", #U+042c CYRILLIC CAPITAL LETTER SOFT SIGN
|
||
"\xdd" => "\xd0\xad", #U+042d CYRILLIC CAPITAL LETTER E
|
||
"\xde" => "\xd0\xae", #U+042e CYRILLIC CAPITAL LETTER YU
|
||
"\xdf" => "\xd0\xaf", #U+042f CYRILLIC CAPITAL LETTER YA
|
||
"\xe0" => "\xd0\xb0", #U+0430 CYRILLIC SMALL LETTER A
|
||
"\xe1" => "\xd0\xb1", #U+0431 CYRILLIC SMALL LETTER BE
|
||
"\xe2" => "\xd0\xb2", #U+0432 CYRILLIC SMALL LETTER VE
|
||
"\xe3" => "\xd0\xb3", #U+0433 CYRILLIC SMALL LETTER GHE
|
||
"\xe4" => "\xd0\xb4", #U+0434 CYRILLIC SMALL LETTER DE
|
||
"\xe5" => "\xd0\xb5", #U+0435 CYRILLIC SMALL LETTER IE
|
||
"\xe6" => "\xd0\xb6", #U+0436 CYRILLIC SMALL LETTER ZHE
|
||
"\xe7" => "\xd0\xb7", #U+0437 CYRILLIC SMALL LETTER ZE
|
||
"\xe8" => "\xd0\xb8", #U+0438 CYRILLIC SMALL LETTER I
|
||
"\xe9" => "\xd0\xb9", #U+0439 CYRILLIC SMALL LETTER SHORT I
|
||
"\xea" => "\xd0\xba", #U+043a CYRILLIC SMALL LETTER KA
|
||
"\xeb" => "\xd0\xbb", #U+043b CYRILLIC SMALL LETTER EL
|
||
"\xec" => "\xd0\xbc", #U+043c CYRILLIC SMALL LETTER EM
|
||
"\xed" => "\xd0\xbd", #U+043d CYRILLIC SMALL LETTER EN
|
||
"\xee" => "\xd0\xbe", #U+043e CYRILLIC SMALL LETTER O
|
||
"\xef" => "\xd0\xbf", #U+043f CYRILLIC SMALL LETTER PE
|
||
"\xf0" => "\xd1\x80", #U+0440 CYRILLIC SMALL LETTER ER
|
||
"\xf1" => "\xd1\x81", #U+0441 CYRILLIC SMALL LETTER ES
|
||
"\xf2" => "\xd1\x82", #U+0442 CYRILLIC SMALL LETTER TE
|
||
"\xf3" => "\xd1\x83", #U+0443 CYRILLIC SMALL LETTER U
|
||
"\xf4" => "\xd1\x84", #U+0444 CYRILLIC SMALL LETTER EF
|
||
"\xf5" => "\xd1\x85", #U+0445 CYRILLIC SMALL LETTER HA
|
||
"\xf6" => "\xd1\x86", #U+0446 CYRILLIC SMALL LETTER TSE
|
||
"\xf7" => "\xd1\x87", #U+0447 CYRILLIC SMALL LETTER CHE
|
||
"\xf8" => "\xd1\x88", #U+0448 CYRILLIC SMALL LETTER SHA
|
||
"\xf9" => "\xd1\x89", #U+0449 CYRILLIC SMALL LETTER SHCHA
|
||
"\xfa" => "\xd1\x8a", #U+044a CYRILLIC SMALL LETTER HARD SIGN
|
||
"\xfb" => "\xd1\x8b", #U+044b CYRILLIC SMALL LETTER YERU
|
||
"\xfc" => "\xd1\x8c", #U+044c CYRILLIC SMALL LETTER SOFT SIGN
|
||
"\xfd" => "\xd1\x8d", #U+044d CYRILLIC SMALL LETTER E
|
||
"\xfe" => "\xd1\x8e", #U+044e CYRILLIC SMALL LETTER YU
|
||
"\xff" => "\xd1\x8f", #U+044f CYRILLIC SMALL LETTER YA
|
||
);
|
||
|
||
/**
|
||
* UTF-8 Case lookup table
|
||
*
|
||
* This lookup table maps the upper case letters to their corresponding
|
||
* lower case letter in UTF-8
|
||
*
|
||
* @author Andreas Gohr <andi@splitbrain.org>
|
||
* @var array
|
||
*/
|
||
public static $convert_case_table = array(
|
||
#CASE_UPPER => case_lower
|
||
"\x41" => "\x61", #A a
|
||
"\x42" => "\x62", #B b
|
||
"\x43" => "\x63", #C c
|
||
"\x44" => "\x64", #D d
|
||
"\x45" => "\x65", #E e
|
||
"\x46" => "\x66", #F f
|
||
"\x47" => "\x67", #G g
|
||
"\x48" => "\x68", #H h
|
||
"\x49" => "\x69", #I i
|
||
"\x4a" => "\x6a", #J j
|
||
"\x4b" => "\x6b", #K k
|
||
"\x4c" => "\x6c", #L l
|
||
"\x4d" => "\x6d", #M m
|
||
"\x4e" => "\x6e", #N n
|
||
"\x4f" => "\x6f", #O o
|
||
"\x50" => "\x70", #P p
|
||
"\x51" => "\x71", #Q q
|
||
"\x52" => "\x72", #R r
|
||
"\x53" => "\x73", #S s
|
||
"\x54" => "\x74", #T t
|
||
"\x55" => "\x75", #U u
|
||
"\x56" => "\x76", #V v
|
||
"\x57" => "\x77", #W w
|
||
"\x58" => "\x78", #X x
|
||
"\x59" => "\x79", #Y y
|
||
"\x5a" => "\x7a", #Z z
|
||
"\xc3\x80" => "\xc3\xa0",
|
||
"\xc3\x81" => "\xc3\xa1",
|
||
"\xc3\x82" => "\xc3\xa2",
|
||
"\xc3\x83" => "\xc3\xa3",
|
||
"\xc3\x84" => "\xc3\xa4",
|
||
"\xc3\x85" => "\xc3\xa5",
|
||
"\xc3\x86" => "\xc3\xa6",
|
||
"\xc3\x87" => "\xc3\xa7",
|
||
"\xc3\x88" => "\xc3\xa8",
|
||
"\xc3\x89" => "\xc3\xa9",
|
||
"\xc3\x8a" => "\xc3\xaa",
|
||
"\xc3\x8b" => "\xc3\xab",
|
||
"\xc3\x8c" => "\xc3\xac",
|
||
"\xc3\x8d" => "\xc3\xad",
|
||
"\xc3\x8e" => "\xc3\xae",
|
||
"\xc3\x8f" => "\xc3\xaf",
|
||
"\xc3\x90" => "\xc3\xb0",
|
||
"\xc3\x91" => "\xc3\xb1",
|
||
"\xc3\x92" => "\xc3\xb2",
|
||
"\xc3\x93" => "\xc3\xb3",
|
||
"\xc3\x94" => "\xc3\xb4",
|
||
"\xc3\x95" => "\xc3\xb5",
|
||
"\xc3\x96" => "\xc3\xb6",
|
||
"\xc3\x98" => "\xc3\xb8",
|
||
"\xc3\x99" => "\xc3\xb9",
|
||
"\xc3\x9a" => "\xc3\xba",
|
||
"\xc3\x9b" => "\xc3\xbb",
|
||
"\xc3\x9c" => "\xc3\xbc",
|
||
"\xc3\x9d" => "\xc3\xbd",
|
||
"\xc3\x9e" => "\xc3\xbe",
|
||
"\xc4\x80" => "\xc4\x81",
|
||
"\xc4\x82" => "\xc4\x83",
|
||
"\xc4\x84" => "\xc4\x85",
|
||
"\xc4\x86" => "\xc4\x87",
|
||
"\xc4\x88" => "\xc4\x89",
|
||
"\xc4\x8a" => "\xc4\x8b",
|
||
"\xc4\x8c" => "\xc4\x8d",
|
||
"\xc4\x8e" => "\xc4\x8f",
|
||
"\xc4\x90" => "\xc4\x91",
|
||
"\xc4\x92" => "\xc4\x93",
|
||
"\xc4\x94" => "\xc4\x95",
|
||
"\xc4\x96" => "\xc4\x97",
|
||
"\xc4\x98" => "\xc4\x99",
|
||
"\xc4\x9a" => "\xc4\x9b",
|
||
"\xc4\x9c" => "\xc4\x9d",
|
||
"\xc4\x9e" => "\xc4\x9f",
|
||
"\xc4\xa0" => "\xc4\xa1",
|
||
"\xc4\xa2" => "\xc4\xa3",
|
||
"\xc4\xa4" => "\xc4\xa5",
|
||
"\xc4\xa6" => "\xc4\xa7",
|
||
"\xc4\xa8" => "\xc4\xa9",
|
||
"\xc4\xaa" => "\xc4\xab",
|
||
"\xc4\xac" => "\xc4\xad",
|
||
"\xc4\xae" => "\xc4\xaf",
|
||
"\xc4\xb2" => "\xc4\xb3",
|
||
"\xc4\xb4" => "\xc4\xb5",
|
||
"\xc4\xb6" => "\xc4\xb7",
|
||
"\xc4\xb9" => "\xc4\xba",
|
||
"\xc4\xbb" => "\xc4\xbc",
|
||
"\xc4\xbd" => "\xc4\xbe",
|
||
"\xc4\xbf" => "\xc5\x80",
|
||
"\xc5\x81" => "\xc5\x82",
|
||
"\xc5\x83" => "\xc5\x84",
|
||
"\xc5\x85" => "\xc5\x86",
|
||
"\xc5\x87" => "\xc5\x88",
|
||
"\xc5\x8a" => "\xc5\x8b",
|
||
"\xc5\x8c" => "\xc5\x8d",
|
||
"\xc5\x8e" => "\xc5\x8f",
|
||
"\xc5\x90" => "\xc5\x91",
|
||
"\xc5\x92" => "\xc5\x93",
|
||
"\xc5\x94" => "\xc5\x95",
|
||
"\xc5\x96" => "\xc5\x97",
|
||
"\xc5\x98" => "\xc5\x99",
|
||
"\xc5\x9a" => "\xc5\x9b",
|
||
"\xc5\x9c" => "\xc5\x9d",
|
||
"\xc5\x9e" => "\xc5\x9f",
|
||
"\xc5\xa0" => "\xc5\xa1",
|
||
"\xc5\xa2" => "\xc5\xa3",
|
||
"\xc5\xa4" => "\xc5\xa5",
|
||
"\xc5\xa6" => "\xc5\xa7",
|
||
"\xc5\xa8" => "\xc5\xa9",
|
||
"\xc5\xaa" => "\xc5\xab",
|
||
"\xc5\xac" => "\xc5\xad",
|
||
"\xc5\xae" => "\xc5\xaf",
|
||
"\xc5\xb0" => "\xc5\xb1",
|
||
"\xc5\xb2" => "\xc5\xb3",
|
||
"\xc5\xb4" => "\xc5\xb5",
|
||
"\xc5\xb6" => "\xc5\xb7",
|
||
"\xc5\xb8" => "\xc3\xbf",
|
||
"\xc5\xb9" => "\xc5\xba",
|
||
"\xc5\xbb" => "\xc5\xbc",
|
||
"\xc5\xbd" => "\xc5\xbe",
|
||
"\xc6\x81" => "\xc9\x93",
|
||
"\xc6\x82" => "\xc6\x83",
|
||
"\xc6\x84" => "\xc6\x85",
|
||
"\xc6\x86" => "\xc9\x94",
|
||
"\xc6\x87" => "\xc6\x88",
|
||
"\xc6\x89" => "\xc9\x96",
|
||
"\xc6\x8a" => "\xc9\x97",
|
||
"\xc6\x8b" => "\xc6\x8c",
|
||
"\xc6\x8e" => "\xc7\x9d",
|
||
"\xc6\x8f" => "\xc9\x99",
|
||
"\xc6\x90" => "\xc9\x9b",
|
||
"\xc6\x91" => "\xc6\x92",
|
||
"\xc6\x94" => "\xc9\xa3",
|
||
"\xc6\x96" => "\xc9\xa9",
|
||
"\xc6\x97" => "\xc9\xa8",
|
||
"\xc6\x98" => "\xc6\x99",
|
||
"\xc6\x9c" => "\xc9\xaf",
|
||
"\xc6\x9d" => "\xc9\xb2",
|
||
"\xc6\x9f" => "\xc9\xb5",
|
||
"\xc6\xa0" => "\xc6\xa1",
|
||
"\xc6\xa2" => "\xc6\xa3",
|
||
"\xc6\xa4" => "\xc6\xa5",
|
||
"\xc6\xa6" => "\xca\x80",
|
||
"\xc6\xa7" => "\xc6\xa8",
|
||
"\xc6\xa9" => "\xca\x83",
|
||
"\xc6\xac" => "\xc6\xad",
|
||
"\xc6\xae" => "\xca\x88",
|
||
"\xc6\xaf" => "\xc6\xb0",
|
||
"\xc6\xb1" => "\xca\x8a",
|
||
"\xc6\xb2" => "\xca\x8b",
|
||
"\xc6\xb3" => "\xc6\xb4",
|
||
"\xc6\xb5" => "\xc6\xb6",
|
||
"\xc6\xb7" => "\xca\x92",
|
||
"\xc6\xb8" => "\xc6\xb9",
|
||
"\xc6\xbc" => "\xc6\xbd",
|
||
"\xc7\x85" => "\xc7\x86",
|
||
"\xc7\x88" => "\xc7\x89",
|
||
"\xc7\x8b" => "\xc7\x8c",
|
||
"\xc7\x8d" => "\xc7\x8e",
|
||
"\xc7\x8f" => "\xc7\x90",
|
||
"\xc7\x91" => "\xc7\x92",
|
||
"\xc7\x93" => "\xc7\x94",
|
||
"\xc7\x95" => "\xc7\x96",
|
||
"\xc7\x97" => "\xc7\x98",
|
||
"\xc7\x99" => "\xc7\x9a",
|
||
"\xc7\x9b" => "\xc7\x9c",
|
||
"\xc7\x9e" => "\xc7\x9f",
|
||
"\xc7\xa0" => "\xc7\xa1",
|
||
"\xc7\xa2" => "\xc7\xa3",
|
||
"\xc7\xa4" => "\xc7\xa5",
|
||
"\xc7\xa6" => "\xc7\xa7",
|
||
"\xc7\xa8" => "\xc7\xa9",
|
||
"\xc7\xaa" => "\xc7\xab",
|
||
"\xc7\xac" => "\xc7\xad",
|
||
"\xc7\xae" => "\xc7\xaf",
|
||
"\xc7\xb2" => "\xc7\xb3",
|
||
"\xc7\xb4" => "\xc7\xb5",
|
||
"\xc7\xb6" => "\xc6\x95",
|
||
"\xc7\xb7" => "\xc6\xbf",
|
||
"\xc7\xb8" => "\xc7\xb9",
|
||
"\xc7\xba" => "\xc7\xbb",
|
||
"\xc7\xbc" => "\xc7\xbd",
|
||
"\xc7\xbe" => "\xc7\xbf",
|
||
"\xc8\x80" => "\xc8\x81",
|
||
"\xc8\x82" => "\xc8\x83",
|
||
"\xc8\x84" => "\xc8\x85",
|
||
"\xc8\x86" => "\xc8\x87",
|
||
"\xc8\x88" => "\xc8\x89",
|
||
"\xc8\x8a" => "\xc8\x8b",
|
||
"\xc8\x8c" => "\xc8\x8d",
|
||
"\xc8\x8e" => "\xc8\x8f",
|
||
"\xc8\x90" => "\xc8\x91",
|
||
"\xc8\x92" => "\xc8\x93",
|
||
"\xc8\x94" => "\xc8\x95",
|
||
"\xc8\x96" => "\xc8\x97",
|
||
"\xc8\x98" => "\xc8\x99",
|
||
"\xc8\x9a" => "\xc8\x9b",
|
||
"\xc8\x9c" => "\xc8\x9d",
|
||
"\xc8\x9e" => "\xc8\x9f",
|
||
"\xc8\xa0" => "\xc6\x9e",
|
||
"\xc8\xa2" => "\xc8\xa3",
|
||
"\xc8\xa4" => "\xc8\xa5",
|
||
"\xc8\xa6" => "\xc8\xa7",
|
||
"\xc8\xa8" => "\xc8\xa9",
|
||
"\xc8\xaa" => "\xc8\xab",
|
||
"\xc8\xac" => "\xc8\xad",
|
||
"\xc8\xae" => "\xc8\xaf",
|
||
"\xc8\xb0" => "\xc8\xb1",
|
||
"\xc8\xb2" => "\xc8\xb3",
|
||
"\xce\x86" => "\xce\xac",
|
||
"\xce\x88" => "\xce\xad",
|
||
"\xce\x89" => "\xce\xae",
|
||
"\xce\x8a" => "\xce\xaf",
|
||
"\xce\x8c" => "\xcf\x8c",
|
||
"\xce\x8e" => "\xcf\x8d",
|
||
"\xce\x8f" => "\xcf\x8e",
|
||
"\xce\x91" => "\xce\xb1",
|
||
"\xce\x92" => "\xce\xb2",
|
||
"\xce\x93" => "\xce\xb3",
|
||
"\xce\x94" => "\xce\xb4",
|
||
"\xce\x95" => "\xce\xb5",
|
||
"\xce\x96" => "\xce\xb6",
|
||
"\xce\x97" => "\xce\xb7",
|
||
"\xce\x98" => "\xce\xb8",
|
||
"\xce\x99" => "\xce\xb9",
|
||
"\xce\x9a" => "\xce\xba",
|
||
"\xce\x9b" => "\xce\xbb",
|
||
"\xce\x9c" => "\xc2\xb5",
|
||
"\xce\x9d" => "\xce\xbd",
|
||
"\xce\x9e" => "\xce\xbe",
|
||
"\xce\x9f" => "\xce\xbf",
|
||
"\xce\xa0" => "\xcf\x80",
|
||
"\xce\xa1" => "\xcf\x81",
|
||
"\xce\xa3" => "\xcf\x82",
|
||
"\xce\xa4" => "\xcf\x84",
|
||
"\xce\xa5" => "\xcf\x85",
|
||
"\xce\xa6" => "\xcf\x86",
|
||
"\xce\xa7" => "\xcf\x87",
|
||
"\xce\xa8" => "\xcf\x88",
|
||
"\xce\xa9" => "\xcf\x89",
|
||
"\xce\xaa" => "\xcf\x8a",
|
||
"\xce\xab" => "\xcf\x8b",
|
||
"\xcf\x98" => "\xcf\x99",
|
||
"\xcf\x9a" => "\xcf\x9b",
|
||
"\xcf\x9c" => "\xcf\x9d",
|
||
"\xcf\x9e" => "\xcf\x9f",
|
||
"\xcf\xa0" => "\xcf\xa1",
|
||
"\xcf\xa2" => "\xcf\xa3",
|
||
"\xcf\xa4" => "\xcf\xa5",
|
||
"\xcf\xa6" => "\xcf\xa7",
|
||
"\xcf\xa8" => "\xcf\xa9",
|
||
"\xcf\xaa" => "\xcf\xab",
|
||
"\xcf\xac" => "\xcf\xad",
|
||
"\xcf\xae" => "\xcf\xaf",
|
||
"\xd0\x80" => "\xd1\x90",
|
||
"\xd0\x81" => "\xd1\x91",
|
||
"\xd0\x82" => "\xd1\x92",
|
||
"\xd0\x83" => "\xd1\x93",
|
||
"\xd0\x84" => "\xd1\x94",
|
||
"\xd0\x85" => "\xd1\x95",
|
||
"\xd0\x86" => "\xd1\x96",
|
||
"\xd0\x87" => "\xd1\x97",
|
||
"\xd0\x88" => "\xd1\x98",
|
||
"\xd0\x89" => "\xd1\x99",
|
||
"\xd0\x8a" => "\xd1\x9a",
|
||
"\xd0\x8b" => "\xd1\x9b",
|
||
"\xd0\x8c" => "\xd1\x9c",
|
||
"\xd0\x8d" => "\xd1\x9d",
|
||
"\xd0\x8e" => "\xd1\x9e",
|
||
"\xd0\x8f" => "\xd1\x9f",
|
||
"\xd0\x90" => "\xd0\xb0",
|
||
"\xd0\x91" => "\xd0\xb1",
|
||
"\xd0\x92" => "\xd0\xb2",
|
||
"\xd0\x93" => "\xd0\xb3",
|
||
"\xd0\x94" => "\xd0\xb4",
|
||
"\xd0\x95" => "\xd0\xb5",
|
||
"\xd0\x96" => "\xd0\xb6",
|
||
"\xd0\x97" => "\xd0\xb7",
|
||
"\xd0\x98" => "\xd0\xb8",
|
||
"\xd0\x99" => "\xd0\xb9",
|
||
"\xd0\x9a" => "\xd0\xba",
|
||
"\xd0\x9b" => "\xd0\xbb",
|
||
"\xd0\x9c" => "\xd0\xbc",
|
||
"\xd0\x9d" => "\xd0\xbd",
|
||
"\xd0\x9e" => "\xd0\xbe",
|
||
"\xd0\x9f" => "\xd0\xbf",
|
||
"\xd0\xa0" => "\xd1\x80",
|
||
"\xd0\xa1" => "\xd1\x81",
|
||
"\xd0\xa2" => "\xd1\x82",
|
||
"\xd0\xa3" => "\xd1\x83",
|
||
"\xd0\xa4" => "\xd1\x84",
|
||
"\xd0\xa5" => "\xd1\x85",
|
||
"\xd0\xa6" => "\xd1\x86",
|
||
"\xd0\xa7" => "\xd1\x87",
|
||
"\xd0\xa8" => "\xd1\x88",
|
||
"\xd0\xa9" => "\xd1\x89",
|
||
"\xd0\xaa" => "\xd1\x8a",
|
||
"\xd0\xab" => "\xd1\x8b",
|
||
"\xd0\xac" => "\xd1\x8c",
|
||
"\xd0\xad" => "\xd1\x8d",
|
||
"\xd0\xae" => "\xd1\x8e",
|
||
"\xd0\xaf" => "\xd1\x8f",
|
||
"\xd1\xa0" => "\xd1\xa1",
|
||
"\xd1\xa2" => "\xd1\xa3",
|
||
"\xd1\xa4" => "\xd1\xa5",
|
||
"\xd1\xa6" => "\xd1\xa7",
|
||
"\xd1\xa8" => "\xd1\xa9",
|
||
"\xd1\xaa" => "\xd1\xab",
|
||
"\xd1\xac" => "\xd1\xad",
|
||
"\xd1\xae" => "\xd1\xaf",
|
||
"\xd1\xb0" => "\xd1\xb1",
|
||
"\xd1\xb2" => "\xd1\xb3",
|
||
"\xd1\xb4" => "\xd1\xb5",
|
||
"\xd1\xb6" => "\xd1\xb7",
|
||
"\xd1\xb8" => "\xd1\xb9",
|
||
"\xd1\xba" => "\xd1\xbb",
|
||
"\xd1\xbc" => "\xd1\xbd",
|
||
"\xd1\xbe" => "\xd1\xbf",
|
||
"\xd2\x80" => "\xd2\x81",
|
||
"\xd2\x8a" => "\xd2\x8b",
|
||
"\xd2\x8c" => "\xd2\x8d",
|
||
"\xd2\x8e" => "\xd2\x8f",
|
||
"\xd2\x90" => "\xd2\x91",
|
||
"\xd2\x92" => "\xd2\x93",
|
||
"\xd2\x94" => "\xd2\x95",
|
||
"\xd2\x96" => "\xd2\x97",
|
||
"\xd2\x98" => "\xd2\x99",
|
||
"\xd2\x9a" => "\xd2\x9b",
|
||
"\xd2\x9c" => "\xd2\x9d",
|
||
"\xd2\x9e" => "\xd2\x9f",
|
||
"\xd2\xa0" => "\xd2\xa1",
|
||
"\xd2\xa2" => "\xd2\xa3",
|
||
"\xd2\xa4" => "\xd2\xa5",
|
||
"\xd2\xa6" => "\xd2\xa7",
|
||
"\xd2\xa8" => "\xd2\xa9",
|
||
"\xd2\xaa" => "\xd2\xab",
|
||
"\xd2\xac" => "\xd2\xad",
|
||
"\xd2\xae" => "\xd2\xaf",
|
||
"\xd2\xb0" => "\xd2\xb1",
|
||
"\xd2\xb2" => "\xd2\xb3",
|
||
"\xd2\xb4" => "\xd2\xb5",
|
||
"\xd2\xb6" => "\xd2\xb7",
|
||
"\xd2\xb8" => "\xd2\xb9",
|
||
"\xd2\xba" => "\xd2\xbb",
|
||
"\xd2\xbc" => "\xd2\xbd",
|
||
"\xd2\xbe" => "\xd2\xbf",
|
||
"\xd3\x81" => "\xd3\x82",
|
||
"\xd3\x83" => "\xd3\x84",
|
||
"\xd3\x85" => "\xd3\x86",
|
||
"\xd3\x87" => "\xd3\x88",
|
||
"\xd3\x89" => "\xd3\x8a",
|
||
"\xd3\x8b" => "\xd3\x8c",
|
||
"\xd3\x8d" => "\xd3\x8e",
|
||
"\xd3\x90" => "\xd3\x91",
|
||
"\xd3\x92" => "\xd3\x93",
|
||
"\xd3\x94" => "\xd3\x95",
|
||
"\xd3\x96" => "\xd3\x97",
|
||
"\xd3\x98" => "\xd3\x99",
|
||
"\xd3\x9a" => "\xd3\x9b",
|
||
"\xd3\x9c" => "\xd3\x9d",
|
||
"\xd3\x9e" => "\xd3\x9f",
|
||
"\xd3\xa0" => "\xd3\xa1",
|
||
"\xd3\xa2" => "\xd3\xa3",
|
||
"\xd3\xa4" => "\xd3\xa5",
|
||
"\xd3\xa6" => "\xd3\xa7",
|
||
"\xd3\xa8" => "\xd3\xa9",
|
||
"\xd3\xaa" => "\xd3\xab",
|
||
"\xd3\xac" => "\xd3\xad",
|
||
"\xd3\xae" => "\xd3\xaf",
|
||
"\xd3\xb0" => "\xd3\xb1",
|
||
"\xd3\xb2" => "\xd3\xb3",
|
||
"\xd3\xb4" => "\xd3\xb5",
|
||
"\xd3\xb8" => "\xd3\xb9",
|
||
"\xd4\x80" => "\xd4\x81",
|
||
"\xd4\x82" => "\xd4\x83",
|
||
"\xd4\x84" => "\xd4\x85",
|
||
"\xd4\x86" => "\xd4\x87",
|
||
"\xd4\x88" => "\xd4\x89",
|
||
"\xd4\x8a" => "\xd4\x8b",
|
||
"\xd4\x8c" => "\xd4\x8d",
|
||
"\xd4\x8e" => "\xd4\x8f",
|
||
"\xd4\xb1" => "\xd5\xa1",
|
||
"\xd4\xb2" => "\xd5\xa2",
|
||
"\xd4\xb3" => "\xd5\xa3",
|
||
"\xd4\xb4" => "\xd5\xa4",
|
||
"\xd4\xb5" => "\xd5\xa5",
|
||
"\xd4\xb6" => "\xd5\xa6",
|
||
"\xd4\xb7" => "\xd5\xa7",
|
||
"\xd4\xb8" => "\xd5\xa8",
|
||
"\xd4\xb9" => "\xd5\xa9",
|
||
"\xd4\xba" => "\xd5\xaa",
|
||
"\xd4\xbb" => "\xd5\xab",
|
||
"\xd4\xbc" => "\xd5\xac",
|
||
"\xd4\xbd" => "\xd5\xad",
|
||
"\xd4\xbe" => "\xd5\xae",
|
||
"\xd4\xbf" => "\xd5\xaf",
|
||
"\xd5\x80" => "\xd5\xb0",
|
||
"\xd5\x81" => "\xd5\xb1",
|
||
"\xd5\x82" => "\xd5\xb2",
|
||
"\xd5\x83" => "\xd5\xb3",
|
||
"\xd5\x84" => "\xd5\xb4",
|
||
"\xd5\x85" => "\xd5\xb5",
|
||
"\xd5\x86" => "\xd5\xb6",
|
||
"\xd5\x87" => "\xd5\xb7",
|
||
"\xd5\x88" => "\xd5\xb8",
|
||
"\xd5\x89" => "\xd5\xb9",
|
||
"\xd5\x8a" => "\xd5\xba",
|
||
"\xd5\x8b" => "\xd5\xbb",
|
||
"\xd5\x8c" => "\xd5\xbc",
|
||
"\xd5\x8d" => "\xd5\xbd",
|
||
"\xd5\x8e" => "\xd5\xbe",
|
||
"\xd5\x8f" => "\xd5\xbf",
|
||
"\xd5\x90" => "\xd6\x80",
|
||
"\xd5\x91" => "\xd6\x81",
|
||
"\xd5\x92" => "\xd6\x82",
|
||
"\xd5\x93" => "\xd6\x83",
|
||
"\xd5\x94" => "\xd6\x84",
|
||
"\xd5\x95" => "\xd6\x85",
|
||
"\xd5\x96" => "\xd6\x86",
|
||
"\xe1\xb8\x80" => "\xe1\xb8\x81",
|
||
"\xe1\xb8\x82" => "\xe1\xb8\x83",
|
||
"\xe1\xb8\x84" => "\xe1\xb8\x85",
|
||
"\xe1\xb8\x86" => "\xe1\xb8\x87",
|
||
"\xe1\xb8\x88" => "\xe1\xb8\x89",
|
||
"\xe1\xb8\x8a" => "\xe1\xb8\x8b",
|
||
"\xe1\xb8\x8c" => "\xe1\xb8\x8d",
|
||
"\xe1\xb8\x8e" => "\xe1\xb8\x8f",
|
||
"\xe1\xb8\x90" => "\xe1\xb8\x91",
|
||
"\xe1\xb8\x92" => "\xe1\xb8\x93",
|
||
"\xe1\xb8\x94" => "\xe1\xb8\x95",
|
||
"\xe1\xb8\x96" => "\xe1\xb8\x97",
|
||
"\xe1\xb8\x98" => "\xe1\xb8\x99",
|
||
"\xe1\xb8\x9a" => "\xe1\xb8\x9b",
|
||
"\xe1\xb8\x9c" => "\xe1\xb8\x9d",
|
||
"\xe1\xb8\x9e" => "\xe1\xb8\x9f",
|
||
"\xe1\xb8\xa0" => "\xe1\xb8\xa1",
|
||
"\xe1\xb8\xa2" => "\xe1\xb8\xa3",
|
||
"\xe1\xb8\xa4" => "\xe1\xb8\xa5",
|
||
"\xe1\xb8\xa6" => "\xe1\xb8\xa7",
|
||
"\xe1\xb8\xa8" => "\xe1\xb8\xa9",
|
||
"\xe1\xb8\xaa" => "\xe1\xb8\xab",
|
||
"\xe1\xb8\xac" => "\xe1\xb8\xad",
|
||
"\xe1\xb8\xae" => "\xe1\xb8\xaf",
|
||
"\xe1\xb8\xb0" => "\xe1\xb8\xb1",
|
||
"\xe1\xb8\xb2" => "\xe1\xb8\xb3",
|
||
"\xe1\xb8\xb4" => "\xe1\xb8\xb5",
|
||
"\xe1\xb8\xb6" => "\xe1\xb8\xb7",
|
||
"\xe1\xb8\xb8" => "\xe1\xb8\xb9",
|
||
"\xe1\xb8\xba" => "\xe1\xb8\xbb",
|
||
"\xe1\xb8\xbc" => "\xe1\xb8\xbd",
|
||
"\xe1\xb8\xbe" => "\xe1\xb8\xbf",
|
||
"\xe1\xb9\x80" => "\xe1\xb9\x81",
|
||
"\xe1\xb9\x82" => "\xe1\xb9\x83",
|
||
"\xe1\xb9\x84" => "\xe1\xb9\x85",
|
||
"\xe1\xb9\x86" => "\xe1\xb9\x87",
|
||
"\xe1\xb9\x88" => "\xe1\xb9\x89",
|
||
"\xe1\xb9\x8a" => "\xe1\xb9\x8b",
|
||
"\xe1\xb9\x8c" => "\xe1\xb9\x8d",
|
||
"\xe1\xb9\x8e" => "\xe1\xb9\x8f",
|
||
"\xe1\xb9\x90" => "\xe1\xb9\x91",
|
||
"\xe1\xb9\x92" => "\xe1\xb9\x93",
|
||
"\xe1\xb9\x94" => "\xe1\xb9\x95",
|
||
"\xe1\xb9\x96" => "\xe1\xb9\x97",
|
||
"\xe1\xb9\x98" => "\xe1\xb9\x99",
|
||
"\xe1\xb9\x9a" => "\xe1\xb9\x9b",
|
||
"\xe1\xb9\x9c" => "\xe1\xb9\x9d",
|
||
"\xe1\xb9\x9e" => "\xe1\xb9\x9f",
|
||
"\xe1\xb9\xa0" => "\xe1\xb9\xa1",
|
||
"\xe1\xb9\xa2" => "\xe1\xb9\xa3",
|
||
"\xe1\xb9\xa4" => "\xe1\xb9\xa5",
|
||
"\xe1\xb9\xa6" => "\xe1\xb9\xa7",
|
||
"\xe1\xb9\xa8" => "\xe1\xb9\xa9",
|
||
"\xe1\xb9\xaa" => "\xe1\xb9\xab",
|
||
"\xe1\xb9\xac" => "\xe1\xb9\xad",
|
||
"\xe1\xb9\xae" => "\xe1\xb9\xaf",
|
||
"\xe1\xb9\xb0" => "\xe1\xb9\xb1",
|
||
"\xe1\xb9\xb2" => "\xe1\xb9\xb3",
|
||
"\xe1\xb9\xb4" => "\xe1\xb9\xb5",
|
||
"\xe1\xb9\xb6" => "\xe1\xb9\xb7",
|
||
"\xe1\xb9\xb8" => "\xe1\xb9\xb9",
|
||
"\xe1\xb9\xba" => "\xe1\xb9\xbb",
|
||
"\xe1\xb9\xbc" => "\xe1\xb9\xbd",
|
||
"\xe1\xb9\xbe" => "\xe1\xb9\xbf",
|
||
"\xe1\xba\x80" => "\xe1\xba\x81",
|
||
"\xe1\xba\x82" => "\xe1\xba\x83",
|
||
"\xe1\xba\x84" => "\xe1\xba\x85",
|
||
"\xe1\xba\x86" => "\xe1\xba\x87",
|
||
"\xe1\xba\x88" => "\xe1\xba\x89",
|
||
"\xe1\xba\x8a" => "\xe1\xba\x8b",
|
||
"\xe1\xba\x8c" => "\xe1\xba\x8d",
|
||
"\xe1\xba\x8e" => "\xe1\xba\x8f",
|
||
"\xe1\xba\x90" => "\xe1\xba\x91",
|
||
"\xe1\xba\x92" => "\xe1\xba\x93",
|
||
"\xe1\xba\x94" => "\xe1\xba\x95",
|
||
"\xe1\xba\xa0" => "\xe1\xba\xa1",
|
||
"\xe1\xba\xa2" => "\xe1\xba\xa3",
|
||
"\xe1\xba\xa4" => "\xe1\xba\xa5",
|
||
"\xe1\xba\xa6" => "\xe1\xba\xa7",
|
||
"\xe1\xba\xa8" => "\xe1\xba\xa9",
|
||
"\xe1\xba\xaa" => "\xe1\xba\xab",
|
||
"\xe1\xba\xac" => "\xe1\xba\xad",
|
||
"\xe1\xba\xae" => "\xe1\xba\xaf",
|
||
"\xe1\xba\xb0" => "\xe1\xba\xb1",
|
||
"\xe1\xba\xb2" => "\xe1\xba\xb3",
|
||
"\xe1\xba\xb4" => "\xe1\xba\xb5",
|
||
"\xe1\xba\xb6" => "\xe1\xba\xb7",
|
||
"\xe1\xba\xb8" => "\xe1\xba\xb9",
|
||
"\xe1\xba\xba" => "\xe1\xba\xbb",
|
||
"\xe1\xba\xbc" => "\xe1\xba\xbd",
|
||
"\xe1\xba\xbe" => "\xe1\xba\xbf",
|
||
"\xe1\xbb\x80" => "\xe1\xbb\x81",
|
||
"\xe1\xbb\x82" => "\xe1\xbb\x83",
|
||
"\xe1\xbb\x84" => "\xe1\xbb\x85",
|
||
"\xe1\xbb\x86" => "\xe1\xbb\x87",
|
||
"\xe1\xbb\x88" => "\xe1\xbb\x89",
|
||
"\xe1\xbb\x8a" => "\xe1\xbb\x8b",
|
||
"\xe1\xbb\x8c" => "\xe1\xbb\x8d",
|
||
"\xe1\xbb\x8e" => "\xe1\xbb\x8f",
|
||
"\xe1\xbb\x90" => "\xe1\xbb\x91",
|
||
"\xe1\xbb\x92" => "\xe1\xbb\x93",
|
||
"\xe1\xbb\x94" => "\xe1\xbb\x95",
|
||
"\xe1\xbb\x96" => "\xe1\xbb\x97",
|
||
"\xe1\xbb\x98" => "\xe1\xbb\x99",
|
||
"\xe1\xbb\x9a" => "\xe1\xbb\x9b",
|
||
"\xe1\xbb\x9c" => "\xe1\xbb\x9d",
|
||
"\xe1\xbb\x9e" => "\xe1\xbb\x9f",
|
||
"\xe1\xbb\xa0" => "\xe1\xbb\xa1",
|
||
"\xe1\xbb\xa2" => "\xe1\xbb\xa3",
|
||
"\xe1\xbb\xa4" => "\xe1\xbb\xa5",
|
||
"\xe1\xbb\xa6" => "\xe1\xbb\xa7",
|
||
"\xe1\xbb\xa8" => "\xe1\xbb\xa9",
|
||
"\xe1\xbb\xaa" => "\xe1\xbb\xab",
|
||
"\xe1\xbb\xac" => "\xe1\xbb\xad",
|
||
"\xe1\xbb\xae" => "\xe1\xbb\xaf",
|
||
"\xe1\xbb\xb0" => "\xe1\xbb\xb1",
|
||
"\xe1\xbb\xb2" => "\xe1\xbb\xb3",
|
||
"\xe1\xbb\xb4" => "\xe1\xbb\xb5",
|
||
"\xe1\xbb\xb6" => "\xe1\xbb\xb7",
|
||
"\xe1\xbb\xb8" => "\xe1\xbb\xb9",
|
||
"\xe1\xbc\x88" => "\xe1\xbc\x80",
|
||
"\xe1\xbc\x89" => "\xe1\xbc\x81",
|
||
"\xe1\xbc\x8a" => "\xe1\xbc\x82",
|
||
"\xe1\xbc\x8b" => "\xe1\xbc\x83",
|
||
"\xe1\xbc\x8c" => "\xe1\xbc\x84",
|
||
"\xe1\xbc\x8d" => "\xe1\xbc\x85",
|
||
"\xe1\xbc\x8e" => "\xe1\xbc\x86",
|
||
"\xe1\xbc\x8f" => "\xe1\xbc\x87",
|
||
"\xe1\xbc\x98" => "\xe1\xbc\x90",
|
||
"\xe1\xbc\x99" => "\xe1\xbc\x91",
|
||
"\xe1\xbc\x9a" => "\xe1\xbc\x92",
|
||
"\xe1\xbc\x9b" => "\xe1\xbc\x93",
|
||
"\xe1\xbc\x9c" => "\xe1\xbc\x94",
|
||
"\xe1\xbc\x9d" => "\xe1\xbc\x95",
|
||
"\xe1\xbc\xa9" => "\xe1\xbc\xa1",
|
||
"\xe1\xbc\xaa" => "\xe1\xbc\xa2",
|
||
"\xe1\xbc\xab" => "\xe1\xbc\xa3",
|
||
"\xe1\xbc\xac" => "\xe1\xbc\xa4",
|
||
"\xe1\xbc\xad" => "\xe1\xbc\xa5",
|
||
"\xe1\xbc\xae" => "\xe1\xbc\xa6",
|
||
"\xe1\xbc\xaf" => "\xe1\xbc\xa7",
|
||
"\xe1\xbc\xb8" => "\xe1\xbc\xb0",
|
||
"\xe1\xbc\xb9" => "\xe1\xbc\xb1",
|
||
"\xe1\xbc\xba" => "\xe1\xbc\xb2",
|
||
"\xe1\xbc\xbb" => "\xe1\xbc\xb3",
|
||
"\xe1\xbc\xbc" => "\xe1\xbc\xb4",
|
||
"\xe1\xbc\xbd" => "\xe1\xbc\xb5",
|
||
"\xe1\xbc\xbe" => "\xe1\xbc\xb6",
|
||
"\xe1\xbc\xbf" => "\xe1\xbc\xb7",
|
||
"\xe1\xbd\x88" => "\xe1\xbd\x80",
|
||
"\xe1\xbd\x89" => "\xe1\xbd\x81",
|
||
"\xe1\xbd\x8a" => "\xe1\xbd\x82",
|
||
"\xe1\xbd\x8b" => "\xe1\xbd\x83",
|
||
"\xe1\xbd\x8c" => "\xe1\xbd\x84",
|
||
"\xe1\xbd\x8d" => "\xe1\xbd\x85",
|
||
"\xe1\xbd\x99" => "\xe1\xbd\x91",
|
||
"\xe1\xbd\x9b" => "\xe1\xbd\x93",
|
||
"\xe1\xbd\x9d" => "\xe1\xbd\x95",
|
||
"\xe1\xbd\x9f" => "\xe1\xbd\x97",
|
||
"\xe1\xbd\xa9" => "\xe1\xbd\xa1",
|
||
"\xe1\xbd\xaa" => "\xe1\xbd\xa2",
|
||
"\xe1\xbd\xab" => "\xe1\xbd\xa3",
|
||
"\xe1\xbd\xac" => "\xe1\xbd\xa4",
|
||
"\xe1\xbd\xad" => "\xe1\xbd\xa5",
|
||
"\xe1\xbd\xae" => "\xe1\xbd\xa6",
|
||
"\xe1\xbd\xaf" => "\xe1\xbd\xa7",
|
||
"\xe1\xbe\x88" => "\xe1\xbe\x80",
|
||
"\xe1\xbe\x89" => "\xe1\xbe\x81",
|
||
"\xe1\xbe\x8a" => "\xe1\xbe\x82",
|
||
"\xe1\xbe\x8b" => "\xe1\xbe\x83",
|
||
"\xe1\xbe\x8c" => "\xe1\xbe\x84",
|
||
"\xe1\xbe\x8d" => "\xe1\xbe\x85",
|
||
"\xe1\xbe\x8e" => "\xe1\xbe\x86",
|
||
"\xe1\xbe\x8f" => "\xe1\xbe\x87",
|
||
"\xe1\xbe\x98" => "\xe1\xbe\x90",
|
||
"\xe1\xbe\x99" => "\xe1\xbe\x91",
|
||
"\xe1\xbe\x9a" => "\xe1\xbe\x92",
|
||
"\xe1\xbe\x9b" => "\xe1\xbe\x93",
|
||
"\xe1\xbe\x9c" => "\xe1\xbe\x94",
|
||
"\xe1\xbe\x9d" => "\xe1\xbe\x95",
|
||
"\xe1\xbe\x9e" => "\xe1\xbe\x96",
|
||
"\xe1\xbe\x9f" => "\xe1\xbe\x97",
|
||
"\xe1\xbe\xa9" => "\xe1\xbe\xa1",
|
||
"\xe1\xbe\xaa" => "\xe1\xbe\xa2",
|
||
"\xe1\xbe\xab" => "\xe1\xbe\xa3",
|
||
"\xe1\xbe\xac" => "\xe1\xbe\xa4",
|
||
"\xe1\xbe\xad" => "\xe1\xbe\xa5",
|
||
"\xe1\xbe\xae" => "\xe1\xbe\xa6",
|
||
"\xe1\xbe\xaf" => "\xe1\xbe\xa7",
|
||
"\xe1\xbe\xb8" => "\xe1\xbe\xb0",
|
||
"\xe1\xbe\xb9" => "\xe1\xbe\xb1",
|
||
"\xe1\xbe\xba" => "\xe1\xbd\xb0",
|
||
"\xe1\xbe\xbb" => "\xe1\xbd\xb1",
|
||
"\xe1\xbe\xbc" => "\xe1\xbe\xb3",
|
||
"\xe1\xbf\x88" => "\xe1\xbd\xb2",
|
||
"\xe1\xbf\x89" => "\xe1\xbd\xb3",
|
||
"\xe1\xbf\x8a" => "\xe1\xbd\xb4",
|
||
"\xe1\xbf\x8b" => "\xe1\xbd\xb5",
|
||
"\xe1\xbf\x8c" => "\xe1\xbf\x83",
|
||
"\xe1\xbf\x98" => "\xe1\xbf\x90",
|
||
"\xe1\xbf\x99" => "\xe1\xbf\x91",
|
||
"\xe1\xbf\x9a" => "\xe1\xbd\xb6",
|
||
"\xe1\xbf\x9b" => "\xe1\xbd\xb7",
|
||
"\xe1\xbf\xa9" => "\xe1\xbf\xa1",
|
||
"\xe1\xbf\xaa" => "\xe1\xbd\xba",
|
||
"\xe1\xbf\xab" => "\xe1\xbd\xbb",
|
||
"\xe1\xbf\xac" => "\xe1\xbf\xa5",
|
||
"\xe1\xbf\xb8" => "\xe1\xbd\xb8",
|
||
"\xe1\xbf\xb9" => "\xe1\xbd\xb9",
|
||
"\xe1\xbf\xba" => "\xe1\xbd\xbc",
|
||
"\xe1\xbf\xbb" => "\xe1\xbd\xbd",
|
||
"\xe1\xbf\xbc" => "\xe1\xbf\xb3",
|
||
"\xef\xbc\xa1" => "\xef\xbd\x81",
|
||
"\xef\xbc\xa2" => "\xef\xbd\x82",
|
||
"\xef\xbc\xa3" => "\xef\xbd\x83",
|
||
"\xef\xbc\xa4" => "\xef\xbd\x84",
|
||
"\xef\xbc\xa5" => "\xef\xbd\x85",
|
||
"\xef\xbc\xa6" => "\xef\xbd\x86",
|
||
"\xef\xbc\xa7" => "\xef\xbd\x87",
|
||
"\xef\xbc\xa8" => "\xef\xbd\x88",
|
||
"\xef\xbc\xa9" => "\xef\xbd\x89",
|
||
"\xef\xbc\xaa" => "\xef\xbd\x8a",
|
||
"\xef\xbc\xab" => "\xef\xbd\x8b",
|
||
"\xef\xbc\xac" => "\xef\xbd\x8c",
|
||
"\xef\xbc\xad" => "\xef\xbd\x8d",
|
||
"\xef\xbc\xae" => "\xef\xbd\x8e",
|
||
"\xef\xbc\xaf" => "\xef\xbd\x8f",
|
||
"\xef\xbc\xb0" => "\xef\xbd\x90",
|
||
"\xef\xbc\xb1" => "\xef\xbd\x91",
|
||
"\xef\xbc\xb2" => "\xef\xbd\x92",
|
||
"\xef\xbc\xb3" => "\xef\xbd\x93",
|
||
"\xef\xbc\xb4" => "\xef\xbd\x94",
|
||
"\xef\xbc\xb5" => "\xef\xbd\x95",
|
||
"\xef\xbc\xb6" => "\xef\xbd\x96",
|
||
"\xef\xbc\xb7" => "\xef\xbd\x97",
|
||
"\xef\xbc\xb8" => "\xef\xbd\x98",
|
||
"\xef\xbc\xb9" => "\xef\xbd\x99",
|
||
"\xef\xbc\xba" => "\xef\xbd\x9a",
|
||
);
|
||
|
||
/**
|
||
* Unicode Character Database 6.0.0 (2010-06-04)
|
||
* Autogenerated by unicode_blocks_txt2php() PHP function at 2011-06-04 00:19:39, 209 blocks total
|
||
*
|
||
* @var array
|
||
*/
|
||
public static $unicode_blocks = array(
|
||
'Basic Latin' => array(
|
||
0 => 0x0000,
|
||
1 => 0x007F,
|
||
2 => 0,
|
||
),
|
||
'Latin-1 Supplement' => array(
|
||
0 => 0x0080,
|
||
1 => 0x00FF,
|
||
2 => 1,
|
||
),
|
||
'Latin Extended-A' => array(
|
||
0 => 0x0100,
|
||
1 => 0x017F,
|
||
2 => 2,
|
||
),
|
||
'Latin Extended-B' => array(
|
||
0 => 0x0180,
|
||
1 => 0x024F,
|
||
2 => 3,
|
||
),
|
||
'IPA Extensions' => array(
|
||
0 => 0x0250,
|
||
1 => 0x02AF,
|
||
2 => 4,
|
||
),
|
||
'Spacing Modifier Letters' => array(
|
||
0 => 0x02B0,
|
||
1 => 0x02FF,
|
||
2 => 5,
|
||
),
|
||
'Combining Diacritical Marks' => array(
|
||
0 => 0x0300,
|
||
1 => 0x036F,
|
||
2 => 6,
|
||
),
|
||
'Greek and Coptic' => array(
|
||
0 => 0x0370,
|
||
1 => 0x03FF,
|
||
2 => 7,
|
||
),
|
||
'Cyrillic' => array(
|
||
0 => 0x0400,
|
||
1 => 0x04FF,
|
||
2 => 8,
|
||
),
|
||
'Cyrillic Supplement' => array(
|
||
0 => 0x0500,
|
||
1 => 0x052F,
|
||
2 => 9,
|
||
),
|
||
'Armenian' => array(
|
||
0 => 0x0530,
|
||
1 => 0x058F,
|
||
2 => 10,
|
||
),
|
||
'Hebrew' => array(
|
||
0 => 0x0590,
|
||
1 => 0x05FF,
|
||
2 => 11,
|
||
),
|
||
'Arabic' => array(
|
||
0 => 0x0600,
|
||
1 => 0x06FF,
|
||
2 => 12,
|
||
),
|
||
'Syriac' => array(
|
||
0 => 0x0700,
|
||
1 => 0x074F,
|
||
2 => 13,
|
||
),
|
||
'Arabic Supplement' => array(
|
||
0 => 0x0750,
|
||
1 => 0x077F,
|
||
2 => 14,
|
||
),
|
||
'Thaana' => array(
|
||
0 => 0x0780,
|
||
1 => 0x07BF,
|
||
2 => 15,
|
||
),
|
||
'NKo' => array(
|
||
0 => 0x07C0,
|
||
1 => 0x07FF,
|
||
2 => 16,
|
||
),
|
||
'Samaritan' => array(
|
||
0 => 0x0800,
|
||
1 => 0x083F,
|
||
2 => 17,
|
||
),
|
||
'Mandaic' => array(
|
||
0 => 0x0840,
|
||
1 => 0x085F,
|
||
2 => 18,
|
||
),
|
||
'Devanagari' => array(
|
||
0 => 0x0900,
|
||
1 => 0x097F,
|
||
2 => 19,
|
||
),
|
||
'Bengali' => array(
|
||
0 => 0x0980,
|
||
1 => 0x09FF,
|
||
2 => 20,
|
||
),
|
||
'Gurmukhi' => array(
|
||
0 => 0x0A00,
|
||
1 => 0x0A7F,
|
||
2 => 21,
|
||
),
|
||
'Gujarati' => array(
|
||
0 => 0x0A80,
|
||
1 => 0x0AFF,
|
||
2 => 22,
|
||
),
|
||
'Oriya' => array(
|
||
0 => 0x0B00,
|
||
1 => 0x0B7F,
|
||
2 => 23,
|
||
),
|
||
'Tamil' => array(
|
||
0 => 0x0B80,
|
||
1 => 0x0BFF,
|
||
2 => 24,
|
||
),
|
||
'Telugu' => array(
|
||
0 => 0x0C00,
|
||
1 => 0x0C7F,
|
||
2 => 25,
|
||
),
|
||
'Kannada' => array(
|
||
0 => 0x0C80,
|
||
1 => 0x0CFF,
|
||
2 => 26,
|
||
),
|
||
'Malayalam' => array(
|
||
0 => 0x0D00,
|
||
1 => 0x0D7F,
|
||
2 => 27,
|
||
),
|
||
'Sinhala' => array(
|
||
0 => 0x0D80,
|
||
1 => 0x0DFF,
|
||
2 => 28,
|
||
),
|
||
'Thai' => array(
|
||
0 => 0x0E00,
|
||
1 => 0x0E7F,
|
||
2 => 29,
|
||
),
|
||
'Lao' => array(
|
||
0 => 0x0E80,
|
||
1 => 0x0EFF,
|
||
2 => 30,
|
||
),
|
||
'Tibetan' => array(
|
||
0 => 0x0F00,
|
||
1 => 0x0FFF,
|
||
2 => 31,
|
||
),
|
||
'Myanmar' => array(
|
||
0 => 0x1000,
|
||
1 => 0x109F,
|
||
2 => 32,
|
||
),
|
||
'Georgian' => array(
|
||
0 => 0x10A0,
|
||
1 => 0x10FF,
|
||
2 => 33,
|
||
),
|
||
'Hangul Jamo' => array(
|
||
0 => 0x1100,
|
||
1 => 0x11FF,
|
||
2 => 34,
|
||
),
|
||
'Ethiopic' => array(
|
||
0 => 0x1200,
|
||
1 => 0x137F,
|
||
2 => 35,
|
||
),
|
||
'Ethiopic Supplement' => array(
|
||
0 => 0x1380,
|
||
1 => 0x139F,
|
||
2 => 36,
|
||
),
|
||
'Cherokee' => array(
|
||
0 => 0x13A0,
|
||
1 => 0x13FF,
|
||
2 => 37,
|
||
),
|
||
'Unified Canadian Aboriginal Syllabics' => array(
|
||
0 => 0x1400,
|
||
1 => 0x167F,
|
||
2 => 38,
|
||
),
|
||
'Ogham' => array(
|
||
0 => 0x1680,
|
||
1 => 0x169F,
|
||
2 => 39,
|
||
),
|
||
'Runic' => array(
|
||
0 => 0x16A0,
|
||
1 => 0x16FF,
|
||
2 => 40,
|
||
),
|
||
'Tagalog' => array(
|
||
0 => 0x1700,
|
||
1 => 0x171F,
|
||
2 => 41,
|
||
),
|
||
'Hanunoo' => array(
|
||
0 => 0x1720,
|
||
1 => 0x173F,
|
||
2 => 42,
|
||
),
|
||
'Buhid' => array(
|
||
0 => 0x1740,
|
||
1 => 0x175F,
|
||
2 => 43,
|
||
),
|
||
'Tagbanwa' => array(
|
||
0 => 0x1760,
|
||
1 => 0x177F,
|
||
2 => 44,
|
||
),
|
||
'Khmer' => array(
|
||
0 => 0x1780,
|
||
1 => 0x17FF,
|
||
2 => 45,
|
||
),
|
||
'Mongolian' => array(
|
||
0 => 0x1800,
|
||
1 => 0x18AF,
|
||
2 => 46,
|
||
),
|
||
'Unified Canadian Aboriginal Syllabics Extended' => array(
|
||
0 => 0x18B0,
|
||
1 => 0x18FF,
|
||
2 => 47,
|
||
),
|
||
'Limbu' => array(
|
||
0 => 0x1900,
|
||
1 => 0x194F,
|
||
2 => 48,
|
||
),
|
||
'Tai Le' => array(
|
||
0 => 0x1950,
|
||
1 => 0x197F,
|
||
2 => 49,
|
||
),
|
||
'New Tai Lue' => array(
|
||
0 => 0x1980,
|
||
1 => 0x19DF,
|
||
2 => 50,
|
||
),
|
||
'Khmer Symbols' => array(
|
||
0 => 0x19E0,
|
||
1 => 0x19FF,
|
||
2 => 51,
|
||
),
|
||
'Buginese' => array(
|
||
0 => 0x1A00,
|
||
1 => 0x1A1F,
|
||
2 => 52,
|
||
),
|
||
'Tai Tham' => array(
|
||
0 => 0x1A20,
|
||
1 => 0x1AAF,
|
||
2 => 53,
|
||
),
|
||
'Balinese' => array(
|
||
0 => 0x1B00,
|
||
1 => 0x1B7F,
|
||
2 => 54,
|
||
),
|
||
'Sundanese' => array(
|
||
0 => 0x1B80,
|
||
1 => 0x1BBF,
|
||
2 => 55,
|
||
),
|
||
'Batak' => array(
|
||
0 => 0x1BC0,
|
||
1 => 0x1BFF,
|
||
2 => 56,
|
||
),
|
||
'Lepcha' => array(
|
||
0 => 0x1C00,
|
||
1 => 0x1C4F,
|
||
2 => 57,
|
||
),
|
||
'Ol Chiki' => array(
|
||
0 => 0x1C50,
|
||
1 => 0x1C7F,
|
||
2 => 58,
|
||
),
|
||
'Vedic Extensions' => array(
|
||
0 => 0x1CD0,
|
||
1 => 0x1CFF,
|
||
2 => 59,
|
||
),
|
||
'Phonetic Extensions' => array(
|
||
0 => 0x1D00,
|
||
1 => 0x1D7F,
|
||
2 => 60,
|
||
),
|
||
'Phonetic Extensions Supplement' => array(
|
||
0 => 0x1D80,
|
||
1 => 0x1DBF,
|
||
2 => 61,
|
||
),
|
||
'Combining Diacritical Marks Supplement' => array(
|
||
0 => 0x1DC0,
|
||
1 => 0x1DFF,
|
||
2 => 62,
|
||
),
|
||
'Latin Extended Additional' => array(
|
||
0 => 0x1E00,
|
||
1 => 0x1EFF,
|
||
2 => 63,
|
||
),
|
||
'Greek Extended' => array(
|
||
0 => 0x1F00,
|
||
1 => 0x1FFF,
|
||
2 => 64,
|
||
),
|
||
'General Punctuation' => array(
|
||
0 => 0x2000,
|
||
1 => 0x206F,
|
||
2 => 65,
|
||
),
|
||
'Superscripts and Subscripts' => array(
|
||
0 => 0x2070,
|
||
1 => 0x209F,
|
||
2 => 66,
|
||
),
|
||
'Currency Symbols' => array(
|
||
0 => 0x20A0,
|
||
1 => 0x20CF,
|
||
2 => 67,
|
||
),
|
||
'Combining Diacritical Marks for Symbols' => array(
|
||
0 => 0x20D0,
|
||
1 => 0x20FF,
|
||
2 => 68,
|
||
),
|
||
'Letterlike Symbols' => array(
|
||
0 => 0x2100,
|
||
1 => 0x214F,
|
||
2 => 69,
|
||
),
|
||
'Number Forms' => array(
|
||
0 => 0x2150,
|
||
1 => 0x218F,
|
||
2 => 70,
|
||
),
|
||
'Arrows' => array(
|
||
0 => 0x2190,
|
||
1 => 0x21FF,
|
||
2 => 71,
|
||
),
|
||
'Mathematical Operators' => array(
|
||
0 => 0x2200,
|
||
1 => 0x22FF,
|
||
2 => 72,
|
||
),
|
||
'Miscellaneous Technical' => array(
|
||
0 => 0x2300,
|
||
1 => 0x23FF,
|
||
2 => 73,
|
||
),
|
||
'Control Pictures' => array(
|
||
0 => 0x2400,
|
||
1 => 0x243F,
|
||
2 => 74,
|
||
),
|
||
'Optical Character Recognition' => array(
|
||
0 => 0x2440,
|
||
1 => 0x245F,
|
||
2 => 75,
|
||
),
|
||
'Enclosed Alphanumerics' => array(
|
||
0 => 0x2460,
|
||
1 => 0x24FF,
|
||
2 => 76,
|
||
),
|
||
'Box Drawing' => array(
|
||
0 => 0x2500,
|
||
1 => 0x257F,
|
||
2 => 77,
|
||
),
|
||
'Block Elements' => array(
|
||
0 => 0x2580,
|
||
1 => 0x259F,
|
||
2 => 78,
|
||
),
|
||
'Geometric Shapes' => array(
|
||
0 => 0x25A0,
|
||
1 => 0x25FF,
|
||
2 => 79,
|
||
),
|
||
'Miscellaneous Symbols' => array(
|
||
0 => 0x2600,
|
||
1 => 0x26FF,
|
||
2 => 80,
|
||
),
|
||
'Dingbats' => array(
|
||
0 => 0x2700,
|
||
1 => 0x27BF,
|
||
2 => 81,
|
||
),
|
||
'Miscellaneous Mathematical Symbols-A' => array(
|
||
0 => 0x27C0,
|
||
1 => 0x27EF,
|
||
2 => 82,
|
||
),
|
||
'Supplemental Arrows-A' => array(
|
||
0 => 0x27F0,
|
||
1 => 0x27FF,
|
||
2 => 83,
|
||
),
|
||
'Braille Patterns' => array(
|
||
0 => 0x2800,
|
||
1 => 0x28FF,
|
||
2 => 84,
|
||
),
|
||
'Supplemental Arrows-B' => array(
|
||
0 => 0x2900,
|
||
1 => 0x297F,
|
||
2 => 85,
|
||
),
|
||
'Miscellaneous Mathematical Symbols-B' => array(
|
||
0 => 0x2980,
|
||
1 => 0x29FF,
|
||
2 => 86,
|
||
),
|
||
'Supplemental Mathematical Operators' => array(
|
||
0 => 0x2A00,
|
||
1 => 0x2AFF,
|
||
2 => 87,
|
||
),
|
||
'Miscellaneous Symbols and Arrows' => array(
|
||
0 => 0x2B00,
|
||
1 => 0x2BFF,
|
||
2 => 88,
|
||
),
|
||
'Glagolitic' => array(
|
||
0 => 0x2C00,
|
||
1 => 0x2C5F,
|
||
2 => 89,
|
||
),
|
||
'Latin Extended-C' => array(
|
||
0 => 0x2C60,
|
||
1 => 0x2C7F,
|
||
2 => 90,
|
||
),
|
||
'Coptic' => array(
|
||
0 => 0x2C80,
|
||
1 => 0x2CFF,
|
||
2 => 91,
|
||
),
|
||
'Georgian Supplement' => array(
|
||
0 => 0x2D00,
|
||
1 => 0x2D2F,
|
||
2 => 92,
|
||
),
|
||
'Tifinagh' => array(
|
||
0 => 0x2D30,
|
||
1 => 0x2D7F,
|
||
2 => 93,
|
||
),
|
||
'Ethiopic Extended' => array(
|
||
0 => 0x2D80,
|
||
1 => 0x2DDF,
|
||
2 => 94,
|
||
),
|
||
'Cyrillic Extended-A' => array(
|
||
0 => 0x2DE0,
|
||
1 => 0x2DFF,
|
||
2 => 95,
|
||
),
|
||
'Supplemental Punctuation' => array(
|
||
0 => 0x2E00,
|
||
1 => 0x2E7F,
|
||
2 => 96,
|
||
),
|
||
'CJK Radicals Supplement' => array(
|
||
0 => 0x2E80,
|
||
1 => 0x2EFF,
|
||
2 => 97,
|
||
),
|
||
'Kangxi Radicals' => array(
|
||
0 => 0x2F00,
|
||
1 => 0x2FDF,
|
||
2 => 98,
|
||
),
|
||
'Ideographic Description Characters' => array(
|
||
0 => 0x2FF0,
|
||
1 => 0x2FFF,
|
||
2 => 99,
|
||
),
|
||
'CJK Symbols and Punctuation' => array(
|
||
0 => 0x3000,
|
||
1 => 0x303F,
|
||
2 => 100,
|
||
),
|
||
'Hiragana' => array(
|
||
0 => 0x3040,
|
||
1 => 0x309F,
|
||
2 => 101,
|
||
),
|
||
'Katakana' => array(
|
||
0 => 0x30A0,
|
||
1 => 0x30FF,
|
||
2 => 102,
|
||
),
|
||
'Bopomofo' => array(
|
||
0 => 0x3100,
|
||
1 => 0x312F,
|
||
2 => 103,
|
||
),
|
||
'Hangul Compatibility Jamo' => array(
|
||
0 => 0x3130,
|
||
1 => 0x318F,
|
||
2 => 104,
|
||
),
|
||
'Kanbun' => array(
|
||
0 => 0x3190,
|
||
1 => 0x319F,
|
||
2 => 105,
|
||
),
|
||
'Bopomofo Extended' => array(
|
||
0 => 0x31A0,
|
||
1 => 0x31BF,
|
||
2 => 106,
|
||
),
|
||
'CJK Strokes' => array(
|
||
0 => 0x31C0,
|
||
1 => 0x31EF,
|
||
2 => 107,
|
||
),
|
||
'Katakana Phonetic Extensions' => array(
|
||
0 => 0x31F0,
|
||
1 => 0x31FF,
|
||
2 => 108,
|
||
),
|
||
'Enclosed CJK Letters and Months' => array(
|
||
0 => 0x3200,
|
||
1 => 0x32FF,
|
||
2 => 109,
|
||
),
|
||
'CJK Compatibility' => array(
|
||
0 => 0x3300,
|
||
1 => 0x33FF,
|
||
2 => 110,
|
||
),
|
||
'CJK Unified Ideographs Extension A' => array(
|
||
0 => 0x3400,
|
||
1 => 0x4DBF,
|
||
2 => 111,
|
||
),
|
||
'Yijing Hexagram Symbols' => array(
|
||
0 => 0x4DC0,
|
||
1 => 0x4DFF,
|
||
2 => 112,
|
||
),
|
||
'CJK Unified Ideographs' => array(
|
||
0 => 0x4E00,
|
||
1 => 0x9FFF,
|
||
2 => 113,
|
||
),
|
||
'Yi Syllables' => array(
|
||
0 => 0xA000,
|
||
1 => 0xA48F,
|
||
2 => 114,
|
||
),
|
||
'Yi Radicals' => array(
|
||
0 => 0xA490,
|
||
1 => 0xA4CF,
|
||
2 => 115,
|
||
),
|
||
'Lisu' => array(
|
||
0 => 0xA4D0,
|
||
1 => 0xA4FF,
|
||
2 => 116,
|
||
),
|
||
'Vai' => array(
|
||
0 => 0xA500,
|
||
1 => 0xA63F,
|
||
2 => 117,
|
||
),
|
||
'Cyrillic Extended-B' => array(
|
||
0 => 0xA640,
|
||
1 => 0xA69F,
|
||
2 => 118,
|
||
),
|
||
'Bamum' => array(
|
||
0 => 0xA6A0,
|
||
1 => 0xA6FF,
|
||
2 => 119,
|
||
),
|
||
'Modifier Tone Letters' => array(
|
||
0 => 0xA700,
|
||
1 => 0xA71F,
|
||
2 => 120,
|
||
),
|
||
'Latin Extended-D' => array(
|
||
0 => 0xA720,
|
||
1 => 0xA7FF,
|
||
2 => 121,
|
||
),
|
||
'Syloti Nagri' => array(
|
||
0 => 0xA800,
|
||
1 => 0xA82F,
|
||
2 => 122,
|
||
),
|
||
'Common Indic Number Forms' => array(
|
||
0 => 0xA830,
|
||
1 => 0xA83F,
|
||
2 => 123,
|
||
),
|
||
'Phags-pa' => array(
|
||
0 => 0xA840,
|
||
1 => 0xA87F,
|
||
2 => 124,
|
||
),
|
||
'Saurashtra' => array(
|
||
0 => 0xA880,
|
||
1 => 0xA8DF,
|
||
2 => 125,
|
||
),
|
||
'Devanagari Extended' => array(
|
||
0 => 0xA8E0,
|
||
1 => 0xA8FF,
|
||
2 => 126,
|
||
),
|
||
'Kayah Li' => array(
|
||
0 => 0xA900,
|
||
1 => 0xA92F,
|
||
2 => 127,
|
||
),
|
||
'Rejang' => array(
|
||
0 => 0xA930,
|
||
1 => 0xA95F,
|
||
2 => 128,
|
||
),
|
||
'Hangul Jamo Extended-A' => array(
|
||
0 => 0xA960,
|
||
1 => 0xA97F,
|
||
2 => 129,
|
||
),
|
||
'Javanese' => array(
|
||
0 => 0xA980,
|
||
1 => 0xA9DF,
|
||
2 => 130,
|
||
),
|
||
'Cham' => array(
|
||
0 => 0xAA00,
|
||
1 => 0xAA5F,
|
||
2 => 131,
|
||
),
|
||
'Myanmar Extended-A' => array(
|
||
0 => 0xAA60,
|
||
1 => 0xAA7F,
|
||
2 => 132,
|
||
),
|
||
'Tai Viet' => array(
|
||
0 => 0xAA80,
|
||
1 => 0xAADF,
|
||
2 => 133,
|
||
),
|
||
'Ethiopic Extended-A' => array(
|
||
0 => 0xAB00,
|
||
1 => 0xAB2F,
|
||
2 => 134,
|
||
),
|
||
'Meetei Mayek' => array(
|
||
0 => 0xABC0,
|
||
1 => 0xABFF,
|
||
2 => 135,
|
||
),
|
||
'Hangul Syllables' => array(
|
||
0 => 0xAC00,
|
||
1 => 0xD7AF,
|
||
2 => 136,
|
||
),
|
||
'Hangul Jamo Extended-B' => array(
|
||
0 => 0xD7B0,
|
||
1 => 0xD7FF,
|
||
2 => 137,
|
||
),
|
||
'High Surrogates' => array(
|
||
0 => 0xD800,
|
||
1 => 0xDB7F,
|
||
2 => 138,
|
||
),
|
||
'High Private Use Surrogates' => array(
|
||
0 => 0xDB80,
|
||
1 => 0xDBFF,
|
||
2 => 139,
|
||
),
|
||
'Low Surrogates' => array(
|
||
0 => 0xDC00,
|
||
1 => 0xDFFF,
|
||
2 => 140,
|
||
),
|
||
'Private Use Area' => array(
|
||
0 => 0xE000,
|
||
1 => 0xF8FF,
|
||
2 => 141,
|
||
),
|
||
'CJK Compatibility Ideographs' => array(
|
||
0 => 0xF900,
|
||
1 => 0xFAFF,
|
||
2 => 142,
|
||
),
|
||
'Alphabetic Presentation Forms' => array(
|
||
0 => 0xFB00,
|
||
1 => 0xFB4F,
|
||
2 => 143,
|
||
),
|
||
'Arabic Presentation Forms-A' => array(
|
||
0 => 0xFB50,
|
||
1 => 0xFDFF,
|
||
2 => 144,
|
||
),
|
||
'Variation Selectors' => array(
|
||
0 => 0xFE00,
|
||
1 => 0xFE0F,
|
||
2 => 145,
|
||
),
|
||
'Vertical Forms' => array(
|
||
0 => 0xFE10,
|
||
1 => 0xFE1F,
|
||
2 => 146,
|
||
),
|
||
'Combining Half Marks' => array(
|
||
0 => 0xFE20,
|
||
1 => 0xFE2F,
|
||
2 => 147,
|
||
),
|
||
'CJK Compatibility Forms' => array(
|
||
0 => 0xFE30,
|
||
1 => 0xFE4F,
|
||
2 => 148,
|
||
),
|
||
'Small Form Variants' => array(
|
||
0 => 0xFE50,
|
||
1 => 0xFE6F,
|
||
2 => 149,
|
||
),
|
||
'Arabic Presentation Forms-B' => array(
|
||
0 => 0xFE70,
|
||
1 => 0xFEFF,
|
||
2 => 150,
|
||
),
|
||
'Halfwidth and Fullwidth Forms' => array(
|
||
0 => 0xFF00,
|
||
1 => 0xFFEF,
|
||
2 => 151,
|
||
),
|
||
'Specials' => array(
|
||
0 => 0xFFF0,
|
||
1 => 0xFFFF,
|
||
2 => 152,
|
||
),
|
||
'Linear B Syllabary' => array(
|
||
0 => 0x10000,
|
||
1 => 0x1007F,
|
||
2 => 153,
|
||
),
|
||
'Linear B Ideograms' => array(
|
||
0 => 0x10080,
|
||
1 => 0x100FF,
|
||
2 => 154,
|
||
),
|
||
'Aegean Numbers' => array(
|
||
0 => 0x10100,
|
||
1 => 0x1013F,
|
||
2 => 155,
|
||
),
|
||
'Ancient Greek Numbers' => array(
|
||
0 => 0x10140,
|
||
1 => 0x1018F,
|
||
2 => 156,
|
||
),
|
||
'Ancient Symbols' => array(
|
||
0 => 0x10190,
|
||
1 => 0x101CF,
|
||
2 => 157,
|
||
),
|
||
'Phaistos Disc' => array(
|
||
0 => 0x101D0,
|
||
1 => 0x101FF,
|
||
2 => 158,
|
||
),
|
||
'Lycian' => array(
|
||
0 => 0x10280,
|
||
1 => 0x1029F,
|
||
2 => 159,
|
||
),
|
||
'Carian' => array(
|
||
0 => 0x102A0,
|
||
1 => 0x102DF,
|
||
2 => 160,
|
||
),
|
||
'Old Italic' => array(
|
||
0 => 0x10300,
|
||
1 => 0x1032F,
|
||
2 => 161,
|
||
),
|
||
'Gothic' => array(
|
||
0 => 0x10330,
|
||
1 => 0x1034F,
|
||
2 => 162,
|
||
),
|
||
'Ugaritic' => array(
|
||
0 => 0x10380,
|
||
1 => 0x1039F,
|
||
2 => 163,
|
||
),
|
||
'Old Persian' => array(
|
||
0 => 0x103A0,
|
||
1 => 0x103DF,
|
||
2 => 164,
|
||
),
|
||
'Deseret' => array(
|
||
0 => 0x10400,
|
||
1 => 0x1044F,
|
||
2 => 165,
|
||
),
|
||
'Shavian' => array(
|
||
0 => 0x10450,
|
||
1 => 0x1047F,
|
||
2 => 166,
|
||
),
|
||
'Osmanya' => array(
|
||
0 => 0x10480,
|
||
1 => 0x104AF,
|
||
2 => 167,
|
||
),
|
||
'Cypriot Syllabary' => array(
|
||
0 => 0x10800,
|
||
1 => 0x1083F,
|
||
2 => 168,
|
||
),
|
||
'Imperial Aramaic' => array(
|
||
0 => 0x10840,
|
||
1 => 0x1085F,
|
||
2 => 169,
|
||
),
|
||
'Phoenician' => array(
|
||
0 => 0x10900,
|
||
1 => 0x1091F,
|
||
2 => 170,
|
||
),
|
||
'Lydian' => array(
|
||
0 => 0x10920,
|
||
1 => 0x1093F,
|
||
2 => 171,
|
||
),
|
||
'Kharoshthi' => array(
|
||
0 => 0x10A00,
|
||
1 => 0x10A5F,
|
||
2 => 172,
|
||
),
|
||
'Old South Arabian' => array(
|
||
0 => 0x10A60,
|
||
1 => 0x10A7F,
|
||
2 => 173,
|
||
),
|
||
'Avestan' => array(
|
||
0 => 0x10B00,
|
||
1 => 0x10B3F,
|
||
2 => 174,
|
||
),
|
||
'Inscriptional Parthian' => array(
|
||
0 => 0x10B40,
|
||
1 => 0x10B5F,
|
||
2 => 175,
|
||
),
|
||
'Inscriptional Pahlavi' => array(
|
||
0 => 0x10B60,
|
||
1 => 0x10B7F,
|
||
2 => 176,
|
||
),
|
||
'Old Turkic' => array(
|
||
0 => 0x10C00,
|
||
1 => 0x10C4F,
|
||
2 => 177,
|
||
),
|
||
'Rumi Numeral Symbols' => array(
|
||
0 => 0x10E60,
|
||
1 => 0x10E7F,
|
||
2 => 178,
|
||
),
|
||
'Brahmi' => array(
|
||
0 => 0x11000,
|
||
1 => 0x1107F,
|
||
2 => 179,
|
||
),
|
||
'Kaithi' => array(
|
||
0 => 0x11080,
|
||
1 => 0x110CF,
|
||
2 => 180,
|
||
),
|
||
'Cuneiform' => array(
|
||
0 => 0x12000,
|
||
1 => 0x123FF,
|
||
2 => 181,
|
||
),
|
||
'Cuneiform Numbers and Punctuation' => array(
|
||
0 => 0x12400,
|
||
1 => 0x1247F,
|
||
2 => 182,
|
||
),
|
||
'Egyptian Hieroglyphs' => array(
|
||
0 => 0x13000,
|
||
1 => 0x1342F,
|
||
2 => 183,
|
||
),
|
||
'Bamum Supplement' => array(
|
||
0 => 0x16800,
|
||
1 => 0x16A3F,
|
||
2 => 184,
|
||
),
|
||
'Kana Supplement' => array(
|
||
0 => 0x1B000,
|
||
1 => 0x1B0FF,
|
||
2 => 185,
|
||
),
|
||
'Byzantine Musical Symbols' => array(
|
||
0 => 0x1D000,
|
||
1 => 0x1D0FF,
|
||
2 => 186,
|
||
),
|
||
'Musical Symbols' => array(
|
||
0 => 0x1D100,
|
||
1 => 0x1D1FF,
|
||
2 => 187,
|
||
),
|
||
'Ancient Greek Musical Notation' => array(
|
||
0 => 0x1D200,
|
||
1 => 0x1D24F,
|
||
2 => 188,
|
||
),
|
||
'Tai Xuan Jing Symbols' => array(
|
||
0 => 0x1D300,
|
||
1 => 0x1D35F,
|
||
2 => 189,
|
||
),
|
||
'Counting Rod Numerals' => array(
|
||
0 => 0x1D360,
|
||
1 => 0x1D37F,
|
||
2 => 190,
|
||
),
|
||
'Mathematical Alphanumeric Symbols' => array(
|
||
0 => 0x1D400,
|
||
1 => 0x1D7FF,
|
||
2 => 191,
|
||
),
|
||
'Mahjong Tiles' => array(
|
||
0 => 0x1F000,
|
||
1 => 0x1F02F,
|
||
2 => 192,
|
||
),
|
||
'Domino Tiles' => array(
|
||
0 => 0x1F030,
|
||
1 => 0x1F09F,
|
||
2 => 193,
|
||
),
|
||
'Playing Cards' => array(
|
||
0 => 0x1F0A0,
|
||
1 => 0x1F0FF,
|
||
2 => 194,
|
||
),
|
||
'Enclosed Alphanumeric Supplement' => array(
|
||
0 => 0x1F100,
|
||
1 => 0x1F1FF,
|
||
2 => 195,
|
||
),
|
||
'Enclosed Ideographic Supplement' => array(
|
||
0 => 0x1F200,
|
||
1 => 0x1F2FF,
|
||
2 => 196,
|
||
),
|
||
'Miscellaneous Symbols And Pictographs' => array(
|
||
0 => 0x1F300,
|
||
1 => 0x1F5FF,
|
||
2 => 197,
|
||
),
|
||
'Emoticons' => array(
|
||
0 => 0x1F600,
|
||
1 => 0x1F64F,
|
||
2 => 198,
|
||
),
|
||
'Transport And Map Symbols' => array(
|
||
0 => 0x1F680,
|
||
1 => 0x1F6FF,
|
||
2 => 199,
|
||
),
|
||
'Alchemical Symbols' => array(
|
||
0 => 0x1F700,
|
||
1 => 0x1F77F,
|
||
2 => 200,
|
||
),
|
||
'CJK Unified Ideographs Extension B' => array(
|
||
0 => 0x20000,
|
||
1 => 0x2A6DF,
|
||
2 => 201,
|
||
),
|
||
'CJK Unified Ideographs Extension C' => array(
|
||
0 => 0x2A700,
|
||
1 => 0x2B73F,
|
||
2 => 202,
|
||
),
|
||
'CJK Unified Ideographs Extension D' => array(
|
||
0 => 0x2B740,
|
||
1 => 0x2B81F,
|
||
2 => 203,
|
||
),
|
||
'CJK Compatibility Ideographs Supplement' => array(
|
||
0 => 0x2F800,
|
||
1 => 0x2FA1F,
|
||
2 => 204,
|
||
),
|
||
'Tags' => array(
|
||
0 => 0xE0000,
|
||
1 => 0xE007F,
|
||
2 => 205,
|
||
),
|
||
'Variation Selectors Supplement' => array(
|
||
0 => 0xE0100,
|
||
1 => 0xE01EF,
|
||
2 => 206,
|
||
),
|
||
'Supplementary Private Use Area-A' => array(
|
||
0 => 0xF0000,
|
||
1 => 0xFFFFF,
|
||
2 => 207,
|
||
),
|
||
'Supplementary Private Use Area-B' => array(
|
||
0 => 0x100000,
|
||
1 => 0x10FFFF,
|
||
2 => 208,
|
||
),
|
||
);
|
||
|
||
/**
 * Private on purpose: the methods of this class are meant to be called
 * statically only, so the class must never be instantiated.
 */
private function __construct()
{
}
|
||
|
||
/**
 * Removes combining diacritical marks from a string, optionally recording
 * enough information to put them back later with self::diactrical_restore().
 *
 * @param   string|null  $s
 * @param   array|null   $additional_chars  extra sequences to strip together with the marks,
 *                                          for example: "\xc2\xad" #soft hyphen = discretionary hyphen
 * @param   bool         $is_can_restored   when TRUE, $restore_table is filled
 * @param   array|null   &$restore_table    receives 'offsets' (character position => removed sequence)
 *                                          and 'length' (character length of the cleaned string)
 * @return  string|bool|null                Returns FALSE if error occurred
 */
public static function diactrical_remove($s, $additional_chars = null, $is_can_restored = false, &$restore_table = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($s) || $s === '') return $s;

    #build the alternation: the diacritical marks plus any caller supplied sequences
    $alternatives = self::DIACTRICAL_RE;
    if ($additional_chars)
    {
        $quoted = array();
        foreach ($additional_chars as $extra) $quoted[] = preg_quote($extra, '/');
        $alternatives .= '|' . implode('|', $quoted);
    }
    $re = '/((?>' . $alternatives . ')+)/sxSX';

    if (! $is_can_restored) return preg_replace($re, '', $s);

    $restore_table = array();
    #with PREG_SPLIT_DELIM_CAPTURE the result alternates: text, removed run, text, ...
    $parts = preg_split($re, $s, -1, PREG_SPLIT_DELIM_CAPTURE);
    $total = count($parts);
    if ($total === 1) return $s; #nothing was removed

    $clean    = '';
    $char_pos = 0;
    $last     = $total - 1;
    $i        = 0;
    while ($i < $last)
    {
        $text   = $parts[$i];
        $clean .= $text;
        #remember character (not byte!) positions
        $char_pos += self::strlen($text);
        $restore_table['offsets'][$char_pos] = $parts[$i + 1];
        $i += 2;
    }
    $tail = end($parts);
    $restore_table['length'] = $char_pos + self::strlen($tail);
    return $clean . $tail;
}
|
||
|
||
/**
 * Restores combining diacritical marks removed by self::diactrical_remove().
 *
 * Restoration is only possible when the character positions and the number
 * of characters in $s have not changed since the marks were removed.
 *
 * @see     self::diactrical_remove()
 * @param   string|null  $s
 * @param   array        $restore_table  table filled by self::diactrical_remove():
 *                                       'length'  => character length of the cleaned string,
 *                                       'offsets' => array(character position => removed sequence)
 * @return  string|bool|null             Returns FALSE if error occurred (broken $restore_table)
 */
public static function diactrical_restore($s, array $restore_table)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($s) || $s === '') return $s;

    if (! $restore_table) return $s; #nothing was removed, nothing to restore
    if (! isset($restore_table['length'], $restore_table['offsets'])
        || ! is_int($restore_table['length'])
        || ! is_array($restore_table['offsets'])
        || $restore_table['length'] !== self::strlen($s)) return false;

    $offset = 0;
    $s2 = '';
    foreach ($restore_table['offsets'] as $pos => $diactricals)
    {
        #positions must be ascending character offsets inside the string,
        #anything else means the table is broken
        if (! is_int($pos) || $pos < $offset || $pos > $restore_table['length'] || ! is_string($diactricals)) return false;
        $s2 .= self::substr($s, $offset, $pos - $offset) . $diactricals;
        $offset = $pos;
    }
    #the byte length is an upper bound for the number of remaining characters
    return $s2 . self::substr($s, $offset, strlen($s));
}
|
||
|
||
/**
 * Recodes data from the given character encoding into UTF-8.
 *
 * @param   array|scalar|null  $data
 * @param   string             $charset  source encoding name (upper-cased before use)
 * @return  array|scalar|null            Returns FALSE if error occurred
 */
public static function convert_from($data, $charset = 'cp1251')
{
    if (ReflectionTypeHint::isValid())
    {
        return self::_convert($data, strtoupper($charset), 'UTF-8');
    }
    return false;
}
|
||
|
||
/**
 * Recodes data from UTF-8 into the given character encoding.
 *
 * @param   array|scalar|null  $data
 * @param   string             $charset  target encoding name (upper-cased before use)
 * @return  array|scalar|null            Returns FALSE if error occurred
 */
public static function convert_to($data, $charset = 'cp1251')
{
    if (ReflectionTypeHint::isValid())
    {
        return self::_convert($data, 'UTF-8', strtoupper($charset));
    }
    return false;
}
|
||
|
||
/**
 * Recodes data of any structure between two character encodings.
 * Arrays are traversed recursively; both keys and values are recoded.
 * Non-string scalars and null are returned untouched.
 *
 * @see     mb_encoding_aliases()
 * @param   array|scalar|null  $data
 * @param   string             $charset_from
 * @param   string             $charset_to
 * @return  array|scalar|null  Returns FALSE if error occurred
 */
private static function _convert($data, $charset_from, $charset_to)
{
    if (! ReflectionTypeHint::isValid()) return false; #for recursive calls
    if ($charset_from === $charset_to) return $data;   #nothing to do

    if (is_array($data))
    {
        $converted = array();
        foreach ($data as $key => $value)
        {
            if (is_string($key))
            {
                $key = self::_convert($key, $charset_from, $charset_to);
                if (! is_string($key)) return false;
            }
            $item = self::_convert($value, $charset_from, $charset_to);
            $converted[$key] = $item;
            #FALSE is an error marker unless the source value itself was a boolean
            if ($item === false && ! is_bool($value)) return false;
        }
        return $converted;
    }

    if (! is_string($data))
    {
        if (is_scalar($data) || is_null($data)) return $data; #null, integer, float, boolean
        return false; #object or resource
    }

    #protection against wrong usage + speed improve:
    #data that is not UTF-8 is not recoded from UTF-8, data that already is UTF-8 is not recoded to it
    if ($charset_from === 'UTF-8')
    {
        if (! self::is_utf8($data)) return $data;
    }
    elseif ($charset_to === 'UTF-8')
    {
        if (self::is_utf8($data)) return $data;
    }

    #since PHP-5.3.x iconv() is faster than mb_convert_encoding()
    if (function_exists('iconv')) return iconv($charset_from, $charset_to . '//IGNORE//TRANSLIT', $data);
    if (function_exists('mb_convert_encoding')) return mb_convert_encoding($data, $charset_to, $charset_from);

    #native fallbacks selected by the source charset
    if ($charset_from === 'ISO-8859-1') return utf8_encode($data);
    if (in_array($charset_from, array('UTF-16', 'UCS-2'), true)) return self::_convert_from_utf16($data);
    if (in_array($charset_from, array('CP1251', 'CP1259'), true)) return strtr($data, self::$cp1259_table);

    #cyrillic charsets go through windows-1251 first: charset name => convert_cyr_string() code
    static $cyr_codes = array(
        'KOI8-R'       => 'k',
        'ISO-8859-5'   => 'i',
        'CP866'        => 'a',
        'MAC-CYRILLIC' => 'm',
    );
    if (isset($cyr_codes[$charset_from]))
    {
        return strtr(convert_cyr_string($data, $cyr_codes[$charset_from], 'w'), self::$cp1259_table);
    }

    #native fallbacks selected by the target charset
    if ($charset_to === 'ISO-8859-1') return utf8_decode($data);
    if (in_array($charset_to, array('CP1251', 'CP1259'), true)) return strtr($data, array_flip(self::$cp1259_table));

    #last attempt
    if (function_exists('recode_string'))
    {
        $recoded = @recode_string($charset_from . '..' . $charset_to, $data);
        if (is_string($recoded)) return $recoded;
    }

    trigger_error('Convert "' . $charset_from . '" --> "' . $charset_to . '" is not supported native, "iconv" or "mbstring" extension required', E_USER_WARNING);
    return false;
}
|
||
|
||
/**
 * Converts a UTF-16 / UCS-2 encoded string to UTF-8.
 * UTF-16 surrogate pairs are supported.
 *
 * Malformed input handled by the native fallback: every unpaired surrogate
 * (a high surrogate not followed by a low one, a lone low surrogate, or a
 * high surrogate at the very end) is replaced with U+FFFD, and the code unit
 * following an unpaired high surrogate is still decoded instead of being lost.
 *
 * @param   string  $s
 * @param   string  $type      'BE' -- big endian byte order
 *                             'LE' -- little endian byte order
 * @param   bool    $to_array  return an array of chars instead of the whole string?
 * @return  string|array|bool  UTF-8 string, array of chars or FALSE if error occurred
 */
private static function _convert_from_utf16($s, $type = 'BE', $to_array = false)
{
    static $types = array(
        'BE' => 'n',  #unsigned short (always 16 bit, big endian byte order)
        'LE' => 'v',  #unsigned short (always 16 bit, little endian byte order)
    );
    if (! array_key_exists($type, $types))
    {
        trigger_error('Unexpected value in 2-nd parameter, "' . $type . '" given!', E_USER_WARNING);
        return false;
    }

    #the fastest way:
    if (function_exists('iconv') || function_exists('mb_convert_encoding'))
    {
        if (function_exists('iconv')) $s = iconv('UTF-16' . $type, 'UTF-8', $s);
        else $s = mb_convert_encoding($s, 'UTF-8', 'UTF-16' . $type);
        if (! is_string($s)) return false; #conversion failed, do not pass the failure on to str_split()
        if (! $to_array) return $s;
        return self::str_split($s);
    }

    /*
    http://en.wikipedia.org/wiki/UTF-16

    UTF-16 represents non-BMP characters (U+10000 through U+10FFFF) using a pair
    of 16-bit words, known as a surrogate pair. 0x10000 is subtracted from the
    code point to give a 20-bit value, which is split into two 10-bit halves:
    the most significant half goes into the first (high) surrogate, range
    0xD800-0xDBFF, the least significant half into the second (low) surrogate,
    range 0xDC00-0xDFFF.

    For example, U+10000 becomes 0xD800 0xDC00 and U+10FFFD becomes 0xDBFF 0xDFFD.
    Code points in U+D800-U+DFFF are never assigned to characters, so a single
    surrogate never represents a character on its own.

    Conversion of a Unicode scalar value S to a surrogate pair <H, L>:
      H = Math.floor((S - 0x10000) / 0x400) + 0xD800;
      L = ((S - 0x10000) % 0x400) + 0xDC00;
    The conversion of a surrogate pair <H, L> to a scalar value:
      N = ((H - 0xD800) * 0x400) + (L - 0xDC00) + 0x10000;
    */
    $replacement = "\xEF\xBF\xBD"; #U+FFFD REPLACEMENT CHARACTER (for broken char)
    $units = unpack($types[$type] . '*', $s);
    if (! is_array($units)) $units = array(); #nothing could be unpacked

    $a  = array();
    $hi = false; #pending high surrogate, FALSE if none
    foreach ($units as $unit)
    {
        if ($hi !== false)
        {
            $pending = $hi;
            $hi = false;
            if ($unit >= 0xDC00 && $unit <= 0xDFFF)
            {
                #valid surrogate pair
                $a[] = self::chr((($pending - 0xD800) * 0x400) + ($unit - 0xDC00) + 0x10000);
                continue;
            }
            #unpaired high surrogate: replace it, the current unit is processed below
            $a[] = $replacement;
        }
        if ($unit >= 0xD800 && $unit <= 0xDBFF) $hi = $unit;                #high surrogate was found
        elseif ($unit >= 0xDC00 && $unit <= 0xDFFF) $a[] = $replacement;    #lone low surrogate
        else $a[] = self::chr($unit);                                       #not surrogate
    }
    if ($hi !== false) $a[] = $replacement; #input ended in the middle of a surrogate pair

    return $to_array ? $a : implode('', $a);
}
|
||
|
||
/**
 * Strips device control codes in the ASCII range (0x00-0x1F except
 * tab, line feed and carriage return) out of the data.
 * Arrays are processed recursively, keys and values.
 *
 * @param   array|scalar|null  $data  Data to clean
 * @return  array|scalar|null         Returns FALSE if error occurred
 */
public static function strict($data)
{
    if (! ReflectionTypeHint::isValid()) return false;

    if (is_string($data)) return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F]+/sSX', '', $data);

    if (! is_array($data))
    {
        #int/float/bool/null pass through, object or resource is an error
        return (is_scalar($data) || is_null($data)) ? $data : false;
    }

    $clean = array();
    foreach ($data as $key => $value)
    {
        if (is_string($key))
        {
            $key = self::strict($key);
            if (! is_string($key)) return false;
        }
        $item = self::strict($value);
        $clean[$key] = $item;
        #FALSE is an error marker unless the source value itself was a boolean
        if ($item === false && ! is_bool($value)) return false;
    }
    return $clean;
}
|
||
|
||
/**
 * Checks whether the data contains ASCII control ("binary") characters,
 * i.e. 0x00-0x1F and 0x7F except tab, line feed and carriage return.
 * For non strings always returns FALSE.
 *
 * @param   scalar|null  $data
 * @param   int|null     $found_char_offset  Returns the offset for the first found binary symbol
 * @return  bool
 */
public static function has_binary($data, &$found_char_offset = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($data)) return false;

    #[\t\n\r] = [\x09\x0a\x0d]
    #[\x00-\x1f\x7f](?<![\t\n\r]) = [\x00-\x08\x0b\x0c\x0e-\x1f\x7f] = [^\x09\x0a\x0d\x20-\x7e\x80-\xff]
    $found = preg_match('~[\x00-\x1f\x7f](?<![\t\n\r])~sSX', $data, $m, PREG_OFFSET_CAPTURE);
    if (! $found) return false;

    #the match offset is in bytes, so measure the prefix with self::strlen()
    $byte_offset = $m[0][1];
    $found_char_offset = self::strlen(substr($data, 0, $byte_offset));
    return true;
}
|
||
|
||
/**
 * Checks whether the data consists of ASCII characters only.
 * Integers and floats are always ASCII; for anything that is not
 * a string/int/float returns FALSE.
 *
 * @param   scalar|null  $data
 * @param   int|null     $error_char_offset  Returns the offset for the first found non ASCII symbol
 * @return  bool
 */
public static function is_ascii($data, &$error_char_offset = null)
{
    if (! ReflectionTypeHint::isValid()) return false;

    if (is_int($data) || is_float($data)) return true;
    if (! is_string($data)) return false;

    #any byte with the high bit set is outside of ASCII
    if (preg_match('~[\x80-\xff]~sSX', $data, $m, PREG_OFFSET_CAPTURE))
    {
        $error_char_offset = $m[0][1];
        return false;
    }
    return true;
}
|
||
|
||
/**
 * Returns TRUE if the data is valid UTF-8 and FALSE otherwise.
 * For null, integer, float, boolean returns TRUE.
 *
 * Arrays are traversed recursively (keys and values): if at least one
 * element is not valid UTF-8, returns FALSE.
 *
 * @link    http://www.w3.org/International/questions/qa-forms-utf-8.html
 * @link    http://ru3.php.net/mb_detect_encoding
 * @link    http://webtest.philigon.ru/articles/utf8/
 * @link    http://unicode.coeurlumiere.com/
 * @param   array|scalar|null  $data
 * @param   bool               $is_strict  also restrict the ASCII control range?
 * @return  bool
 */
public static function is_utf8($data, $is_strict = true)
{
    if (! ReflectionTypeHint::isValid()) return false;

    if (is_array($data))
    {
        foreach ($data as $key => $value)
        {
            if (! (self::is_utf8($key, $is_strict) && self::is_utf8($value, $is_strict))) return false;
        }
        return true;
    }

    #int/float/bool/null are fine, object or resource is not
    if (! is_string($data)) return is_scalar($data) || is_null($data);

    #an empty pattern with the "u" modifier makes PCRE validate the subject as UTF-8;
    #this is much faster (up to 4 times) than mb_check_encoding($data, 'UTF-8')
    if (preg_match('~~suSX', $data) !== 1) return false;
    if (! $is_strict) return true;

    /*
     * Control characters allowed in strict mode are the special characters
     * of the JSON specification (http://json.org/):
     *   \b represents the backspace character (U+0008)
     *   \t represents the character tabulation character (U+0009)
     *   \n represents the line feed character (U+000A)
     *   \f represents the form feed character (U+000C)
     *   \r represents the carriage return character (U+000D)
     * With this regular expression preg_match() works twice as fast as strpbrk().
     */
    return ! preg_match('/[^\x08\x09\x0A\x0C\x0D\x20-\xBF\xC2-\xF7]/sSX', $data);
}
|
||
|
||
/**
 * Tries to detect whether a string is UTF-8 encoded by walking over its bytes.
 *
 * @deprecated  Slow, use self::is_utf8() instead
 * @see     self::is_utf8()
 * @param   string  $s          text
 * @param   bool    $is_strict  strict check of the ASCII range?
 * @return  bool
 */
public static function check($s, $is_strict = true)
{
    if (! ReflectionTypeHint::isValid()) return false;

    $len = strlen($s);
    $i = 0;
    while ($i < $len)
    {
        $byte = ord($s[$i]);
        $i++;

        if ($byte < 0x80) #1 byte 0bbbbbbb
        {
            #in strict mode only printable ASCII, tab, line feed and carriage return are accepted
            $allowed = $is_strict === false
                    || ($byte > 0x1F && $byte < 0x7F)
                    || $byte == 0x09 || $byte == 0x0A || $byte == 0x0D;
            if (! $allowed) return false;
            continue;
        }

        #number of continuation bytes announced by the lead byte
        if     (($byte & 0xE0) == 0xC0) $tail = 1; #2 bytes 110bbbbb 10bbbbbb
        elseif (($byte & 0xF0) == 0xE0) $tail = 2; #3 bytes 1110bbbb 10bbbbbb 10bbbbbb
        elseif (($byte & 0xF8) == 0xF0) $tail = 3; #4 bytes 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
        elseif (($byte & 0xFC) == 0xF8) $tail = 4; #5 bytes 111110bb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb
        elseif (($byte & 0xFE) == 0xFC) $tail = 5; #6 bytes 1111110b 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb
        else return false; #does not match any model

        #every announced byte must exist and match 10bbbbbb
        while ($tail-- > 0)
        {
            if ($i == $len || (ord($s[$i]) & 0xC0) != 0x80) return false;
            $i++;
        }
    }
    return true;
}
|
||
|
||
/**
 * Checks that every character of UTF-8 data belongs to the given ranges
 * of the UNICODE standard. A suitable alternative to regular expressions.
 *
 * Integers and floats are checked by their string representation.
 * Any other non-string, non-array value (null, boolean, object, resource)
 * returns FALSE.
 *
 * Arrays are traversed recursively (keys and values).
 * If at least one key or value does not pass the check, it returns FALSE.
 *
 * @example
 *   #A simple check the standard named ranges:
 *   UTF8::blocks_check('поисковые системы Google и Yandex', array('Basic Latin', 'Cyrillic'));
 *   #You can check the named, direct ranges or codepoints together:
 *   UTF8::blocks_check('поисковые системы Google и Yandex', array(array(0x20, 0x7E),     #[\x20-\x7E]
 *                                                                 array(0x0410, 0x044F), #[A-Яa-я]
 *                                                                 0x0401, #russian yo (Ё)
 *                                                                 0x0451, #russian ye (ё)
 *                                                                 'Arrows',
 *                                                                ));
 *
 * @link    http://www.unicode.org/charts/
 * @param   array|scalar|null  $data
 * @param   array|string       $blocks  block names from self::$unicode_blocks,
 *                                      array(min, max) codepoint ranges and/or single codepoints
 * @return  bool               Returns TRUE if all characters of the text belong to the given ranges,
 *                             FALSE otherwise, for broken UTF-8 or for an invalid block definition.
 */
public static function blocks_check($data, $blocks)
{
    if (! ReflectionTypeHint::isValid()) return false;

    if (is_array($data))
    {
        foreach ($data as $k => $v)
        {
            if (! self::blocks_check($k, $blocks) || ! self::blocks_check($v, $blocks)) return false;
        }
        return true;
    }

    if (is_int($data)) $data = strval($data);
    elseif (is_float($data)) $data = str_replace(',', '.', strval($data)); #locale independent decimal point
    elseif (! is_string($data)) return false;

    if ($data === '') return true; #no characters -- nothing can be out of range

    $chars = self::str_split($data);
    if ($chars === false) return false; #broken UTF-8
    unset($data); #memory free

    $blocks = (array)$blocks; #loop invariant: normalize a single block to a list once
    $skip = array(); #cache of already checked symbols
    foreach ($chars as $char)
    {
        if (array_key_exists($char, $skip)) continue; #speed improve
        $codepoint = self::ord($char);
        if (! is_int($codepoint)) return false; #broken UTF-8?

        $is_valid = false;
        foreach ($blocks as $j => $block)
        {
            if (is_string($block))
            {
                if (! array_key_exists($block, self::$unicode_blocks))
                {
                    trigger_error('Unknown block "' . $block . '"!', E_USER_WARNING);
                    return false;
                }
                list ($min, $max) = self::$unicode_blocks[$block];
            }
            elseif (is_array($block)) list ($min, $max) = $block;
            elseif (is_int($block)) $min = $max = $block;
            else
            {
                trigger_error('A string/array/int type expected for block[' . $j . ']!', E_USER_ERROR);
                #reached only if an error handler resumes execution:
                #never compare against stale or undefined $min/$max
                return false;
            }
            if ($codepoint >= $min && $codepoint <= $max)
            {
                $is_valid = true;
                break;
            }
        }
        if (! $is_valid) return false;
        $skip[$char] = null;
    }
    return true;
}
|
||
|
||
/**
 * Compares two strings.
 * Uses the intl Collator when it is available, otherwise falls back
 * to the byte-wise strcmp().
 *
 * @param   string|null    $s1
 * @param   string|null    $s2
 * @param   string         $locale   For example, 'en_CA', 'ru_RU'
 * @return  int|bool|null  Returns FALSE if error occurred,
 *                         NULL if one of the arguments is not a string.
 *                         Returns < 0 if $s1 is less than $s2;
 *                                 > 0 if $s1 is greater than $s2;
 *                                 0 if they are equal.
 */
public static function strcmp($s1, $s2, $locale = '')
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($s1) || ! is_string($s2)) return null;
    if (! function_exists('collator_create')) return strcmp($s1, $s2);

    # PHP 5 >= 5.3.0, PECL intl >= 1.0.0
    # If empty string ("") or "root" are passed, UCA rules will be used.
    # collator_create() is used instead of "new Collator()": it returns NULL on
    # failure, while the result of "new" is never falsy, so the error could not be detected.
    $collator = collator_create($locale);
    if (! is_object($collator))
    {
        # Use intl_get_error_code() and/or intl_get_error_message() to know what happened.
        trigger_error(intl_get_error_message(), E_USER_WARNING);
        return false;
    }
    return $collator->compare($s1, $s2);
}
|
||
|
||
/**
 * Compares the first N characters of two strings.
 *
 * @param   string|null    $s1
 * @param   string|null    $s2
 * @param   int            $length   number of leading characters to compare
 * @return  int|bool|null  Returns FALSE if error occurred,
 *                         NULL if one of the arguments is not a string.
 *                         Returns < 0 if $s1 is less than $s2;
 *                                 > 0 if $s1 is greater than $s2;
 *                                 0 if they are equal.
 */
public static function strncmp($s1, $s2, $length)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! (is_string($s1) && is_string($s2))) return null;

    $head1 = self::substr($s1, 0, $length);
    $head2 = self::substr($s2, 0, $length);
    return self::strcmp($head1, $head2);
}
|
||
|
||
/**
 * Case-insensitive comparison of two UTF-8 strings:
 * both strings are lower-cased and then compared with self::strcmp().
 *
 * @param   string|null    $s1
 * @param   string|null    $s2
 * @return  int|bool|null  Returns FALSE if error occurred,
 *                         NULL if one of the arguments is not a string.
 *                         Returns < 0 if $s1 is less than $s2;
 *                                 > 0 if $s1 is greater than $s2;
 *                                 0 if they are equal.
 */
public static function strcasecmp($s1, $s2)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! (is_string($s1) && is_string($s2))) return null;

    $lower1 = self::lowercase($s1);
    $lower2 = self::lowercase($s2);
    return self::strcmp($lower1, $lower2);
}
|
||
|
||
/**
 * Converts a UTF-8 string to an array of UNICODE codepoints.
 * Non-string values and the empty string are returned as is.
 *
 * @param   string|null      $s  UTF-8 string
 * @return  array|bool|null  Unicode codepoints
 *                           Returns FALSE if $s broken (not UTF-8)
 */
public static function to_unicode($s)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($s) || $s === '') return $s;

    #since PHP-5.3.x iconv() is a little faster than mb_convert_encoding()
    $has_iconv = function_exists('iconv');
    if ($has_iconv || function_exists('mb_convert_encoding'))
    {
        $ucs4 = $has_iconv ? @iconv('UTF-8', 'UCS-4BE', $s)
                           : @mb_convert_encoding($s, 'UCS-4BE', 'UTF-8');
        #every character is now exactly 4 bytes, big endian
        if (is_string($ucs4)) return array_values(unpack('N*', $ucs4));
        if ($ucs4 !== null) return false;
    }

    #native fallback: split into characters and convert them one by one
    $chars = self::str_split($s);
    if (! is_array($chars)) return false;
    return array_map(array(__CLASS__, 'ord'), $chars);
}
|
||
|
||
/**
 * Converts an array of UNICODE codepoints to a UTF-8 string.
 * Non-array values are returned as is.
 *
 * @param   array|null        $a  Unicode codepoints
 * @return  string|bool|null  UTF-8 string
 *                            Returns FALSE if error occurred
 */
public static function from_unicode($a)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_array($a)) return $a;

    #since PHP-5.3.x iconv() is a little faster than mb_convert_encoding()
    $has_iconv = function_exists('iconv');
    if ($has_iconv || function_exists('mb_convert_encoding'))
    {
        #pack every codepoint as 4 bytes, big endian (UCS-4BE)
        $packed = '';
        foreach ($a as $cp) $packed .= pack('N', $cp);

        if ($has_iconv) $s = @iconv('UCS-4BE', 'UTF-8', $packed);
        else $s = mb_convert_encoding($packed, 'UTF-8', 'UCS-4BE');

        return is_string($s) ? $s : false;
    }

    #native fallback: convert codepoints one by one
    return implode('', array_map(array(__CLASS__, 'chr'), $a));
}
|
||
|
||
/**
 * Converts a single UTF-8 character to its Unicode codepoint.
 * The decoding formula is selected by the byte length of $char (1 to 4 bytes);
 * results are memoized in a static cache.
 *
 * @param   string|null    $char  UTF-8 character
 * @return  int|bool|null  Unicode codepoint.
 *                         Non-string input is returned as is.
 *                         Returns FALSE (and raises E_USER_WARNING) if the byte length
 *                         of $char is not 1..4 (not a UTF-8 character)
 */
public static function ord($char)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($char)) return $char;

    static $cache = array();
    if (array_key_exists($char, $cache)) return $cache[$char]; #speed improve

    #bytes are read with $char[N]: the $char{N} syntax is deprecated since PHP 7.4 and removed in PHP 8.0
    switch (strlen($char))
    {
        case 1  : return $cache[$char] = ord($char);
        case 2  : return $cache[$char] = (ord($char[1]) & 63) |
                                         ((ord($char[0]) & 31) << 6);
        case 3  : return $cache[$char] = (ord($char[2]) & 63) |
                                         ((ord($char[1]) & 63) << 6) |
                                         ((ord($char[0]) & 15) << 12);
        case 4  : return $cache[$char] = (ord($char[3]) & 63) |
                                         ((ord($char[2]) & 63) << 6) |
                                         ((ord($char[1]) & 63) << 12) |
                                         ((ord($char[0]) & 7)  << 18);
        default :
            trigger_error('Character 0x' . bin2hex($char) . ' is not UTF-8!', E_USER_WARNING);
            return false;
    }
}
|
||
|
||
/**
 * Converts a Unicode codepoint to a UTF-8 character.
 * Codepoints above 0x10FFFF produce U+FFFD REPLACEMENT CHARACTER.
 * Results are memoized in a static cache.
 *
 * @param   int|digit|null    $cp  Unicode codepoint
 * @return  string|bool|null  UTF-8 character.
 *                            Input that is neither an integer nor a string of digits is returned as is.
 *                            Returns FALSE if error occurred
 */
public static function chr($cp)
{
    if (! ReflectionTypeHint::isValid()) {
        return false;
    }
    if (! (is_int($cp) || ctype_digit($cp))) {
        return $cp;
    }

    static $memo = array();
    if (array_key_exists($cp, $memo)) {
        return $memo[$cp]; #speed improve
    }

    if ($cp <= 0x7f) {
        #1 byte
        $char = chr($cp);
    } elseif ($cp <= 0x7ff) {
        #2 bytes
        $char = chr(0xc0 | ($cp >> 6))
              . chr(0x80 | ($cp & 0x3f));
    } elseif ($cp <= 0xffff) {
        #3 bytes
        $char = chr(0xe0 | ($cp >> 12))
              . chr(0x80 | (($cp >> 6) & 0x3f))
              . chr(0x80 | ($cp & 0x3f));
    } elseif ($cp <= 0x10ffff) {
        #4 bytes
        $char = chr(0xf0 | ($cp >> 18))
              . chr(0x80 | (($cp >> 12) & 0x3f))
              . chr(0x80 | (($cp >> 6) & 0x3f))
              . chr(0x80 | ($cp & 0x3f));
    } else {
        #U+FFFD REPLACEMENT CHARACTER
        $char = "\xEF\xBF\xBD";
    }
    return $memo[$cp] = $char;
}
|
||
|
||
/**
 * Implementation of chunk_split() for UTF-8 strings.
 * The string is cut into pieces of $length characters by self::str_split() and the pieces
 * are joined with implode(), so $glue appears only BETWEEN chunks (not after the last one).
 *
 * @param   string|null       $s
 * @param   int|digit|null    $length  Chunk length in characters; values below 1 fall back to 76
 * @param   string|null       $glue    Chunk separator; an empty value falls back to "\r\n"
 * @return  string|bool|null  Non-string input and the empty string are returned as is.
 *                            Returns FALSE if error occurred
 */
public static function chunk_split($s, $length = null, $glue = null)
{
    if (! ReflectionTypeHint::isValid()) {
        return false;
    }
    if (! is_string($s) || $s === '') {
        return $s;
    }

    $chunk_size = intval($length);
    if ($chunk_size < 1) {
        $chunk_size = 76;
    }
    $separator = strval($glue);
    if ($separator === '') {
        $separator = "\r\n";
    }

    $chunks = self::str_split($s, $chunk_size);
    return is_array($chunks) ? implode($separator, $chunks) : false;
}
|
||
|
||
/**
 * Changes the case of all string keys in an array (UTF-8 aware).
 * Non-string keys are kept untouched; values are copied as is, except that nested
 * arrays are processed too when $is_recursive is set.
 *
 * @param   array|null       $a
 * @param   int              $mode          {CASE_LOWER|CASE_UPPER}
 * @param   bool             $is_recursive
 * @return  array|bool|null  Non-array input is returned as is.
 *                           Returns FALSE if error occurred
 */
public static function array_change_key_case($a, $mode, $is_recursive = false)
{
    if (! ReflectionTypeHint::isValid()) {
        return false;
    }
    if (! is_array($a)) {
        return $a;
    }

    $result = array();
    foreach ($a as $key => $value) {
        if (is_string($key)) {
            $key = self::convert_case($key, $mode);
            if ($key === false) {
                return false;
            }
        }
        if ($is_recursive && is_array($value)) { #recursive support
            $value = self::array_change_key_case($value, $mode, $is_recursive);
            if (! is_array($value)) {
                return false;
            }
        }
        $result[$key] = $value;
    }
    return $result;
}
|
||
|
||
/**
 * Converts the letter case of data in UTF-8 encoding.
 * Arrays are traversed recursively; only the values of array elements are converted,
 * the keys stay unchanged.
 * To convert keys only, use self::array_change_key_case().
 *
 * @see     self::array_change_key_case()
 * @link    http://www.unicode.org/charts/PDF/U0400.pdf
 * @link    http://ru.wikipedia.org/wiki/ISO_639-1
 * @param   array|scalar|null  $data  Data of arbitrary structure
 * @param   int                $mode  {CASE_LOWER|CASE_UPPER}
 * @param   bool               $is_ascii_optimization  for speed improve
 * @return  scalar|bool|null   Converted data (an array for array input).
 *                             Non-string scalars, NULL and empty strings are returned as is.
 *                             With an unknown $mode an E_USER_WARNING is raised and the data is returned unchanged.
 *                             Returns FALSE if error occurred
 */
public static function convert_case($data, $mode, $is_ascii_optimization = true)
{
    if (! ReflectionTypeHint::isValid()) return false;

    if (is_array($data)) #recursive support
    {
        foreach ($data as $k => $v)
        {
            #hand the optimization flag down, so nested values follow the caller's choice
            $data[$k] = self::convert_case($v, $mode, $is_ascii_optimization);
            #FALSE is an error marker only when the source value was not a boolean itself
            if ($data[$k] === false && ! is_bool($v)) return false;
        }
        return $data;
    }
    if (! is_string($data) || ! $data) return $data;

    if ($mode === CASE_UPPER)
    {
        if ($is_ascii_optimization && self::is_ascii($data)) return strtoupper($data); #speed improve!
        #mb_strtoupper() is deliberately not used: since PHP-5.3.x strtr() is 2-3 times faster
        #the table is applied as is for lowercasing, so it is flipped here for uppercasing
        return strtr($data, array_flip(self::$convert_case_table));
    }
    if ($mode === CASE_LOWER)
    {
        if ($is_ascii_optimization && self::is_ascii($data)) return strtolower($data); #speed improve!
        #mb_strtolower() is deliberately not used: since PHP-5.3.x strtr() is 2-3 times faster
        return strtr($data, self::$convert_case_table);
    }
    trigger_error('Parameter 2 should be a constant of CASE_LOWER or CASE_UPPER!', E_USER_WARNING);
    return $data;
}
|
||
|
||
/**
 * Converts data to lower case.
 * Delegates to self::convert_case() with CASE_LOWER.
 *
 * @param   array|scalar|null  $data
 * @return  scalar|bool|null   Returns FALSE if error occurred
 */
public static function lowercase($data)
{
    if (ReflectionTypeHint::isValid()) {
        return self::convert_case($data, CASE_LOWER);
    }
    return false;
}
|
||
|
||
/**
 * Converts data to upper case.
 * Delegates to self::convert_case() with CASE_UPPER.
 *
 * @param   array|scalar|null  $data
 * @return  scalar|null        Returns FALSE if error occurred
 */
public static function uppercase($data)
{
    if (ReflectionTypeHint::isValid()) {
        return self::convert_case($data, CASE_UPPER);
    }
    return false;
}
|
||
|
||
/**
 * Converts data to lower case (same behaviour as self::lowercase(), named after the native function).
 * Delegates to self::convert_case() with CASE_LOWER.
 *
 * @param   array|scalar|null  $data
 * @return  scalar|bool|null   Returns FALSE if error occurred
 */
public static function strtolower($data)
{
    if (ReflectionTypeHint::isValid()) {
        return self::convert_case($data, CASE_LOWER);
    }
    return false;
}
|
||
|
||
/**
 * Converts data to upper case (same behaviour as self::uppercase(), named after the native function).
 * Delegates to self::convert_case() with CASE_UPPER.
 *
 * @param   array|scalar|null  $data
 * @return  scalar|null        Returns FALSE if error occurred
 */
public static function strtoupper($data)
{
    if (ReflectionTypeHint::isValid()) {
        return self::convert_case($data, CASE_UPPER);
    }
    return false;
}
|
||
|
||
|
||
/**
 * Converts all HTML entities to native UTF-8 characters.
 * Decodes many more named entities than the standard html_entity_decode().
 * All decimal and hexadecimal numeric entities are converted to UTF-8 as well.
 *
 * Example: '&quot;' or '&#34;' or '&#x22;' will be converted to '"'.
 *
 * @link    http://www.htmlhelp.com/reference/html40/entities/
 * @link    http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References)
 * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true
 * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true
 * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true
 *
 * @param   scalar|null  $s
 * @param   bool         $is_special_chars  Also decode the special html entities? (&lt; &gt; &amp; &quot; &apos;)
 *                                          When FALSE, a numeric entity that denotes a special character
 *                                          is rewritten to its named entity instead of being decoded.
 * @return  scalar|null  Non-string input and the empty string are returned as is.
 *                       Returns FALSE if error occurred
 */
public static function html_entity_decode($s, $is_special_chars = false)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($s) || $s === '') return $s;

    #speed improve: the shortest entity takes 4 bytes (&#d; or &xx;)
    if (strlen($s) < 4) return $s;
    #an entity needs "&" and a ";" somewhere after it
    #(the assignment is parenthesized on purpose: "$pos = strpos(...) === false" would store a boolean)
    $pos = strpos($s, '&');
    if ($pos === false || strpos($s, ';', $pos) === false) return $s;

    $table = self::$html_entity_table;
    if ($is_special_chars)
    {
        $table += self::$html_special_chars_table
                + array(
                    #&apos; entity is only available in XHTML/HTML5 and not in plain HTML, see http://www.w3.org/TR/xhtml1/#C_16
                    '&apos;' => "\x27",  #U+0027 ['] apostrophe
                );
    }
    #replace named entities (since PHP-5.3.x strtr() is faster here than preg_match_all() + str_replace())
    $s = strtr($s, $table);

    #replace numeric dec and hex entities:
    if (strpos($s, '&#') !== false) #speed improve
    {
        $class = __CLASS__;
        $html_special_chars_table_flipped = array_flip(self::$html_special_chars_table);
        $s = preg_replace_callback('/&#((x)[\da-fA-F]{1,6}+|\d{1,7}+);/sSX',
                                    function (array $m) use ($class, $html_special_chars_table_flipped, $is_special_chars)
                                    {
                                        #$m[1] still carries the leading "x" of a hex entity: strip it before hexdec()
                                        $is_hex    = isset($m[2]) && $m[2] === 'x';
                                        $codepoint = $is_hex ? hexdec(substr($m[1], 1)) : intval($m[1]);
                                        #pack('C') keeps only the low 8 bits, so larger codepoints must not be looked up
                                        if (! $is_special_chars && $codepoint <= 0xFF)
                                        {
                                            $char = pack('C', $codepoint);
                                            if (array_key_exists($char, $html_special_chars_table_flipped)) return $html_special_chars_table_flipped[$char];
                                        }
                                        return $class::chr($codepoint);
                                    }, $s);
    }
    return $s;
}
|
||
|
||
/**
 * Converts special UTF-8 characters to HTML entities.
 * Encodes many more named entities than the standard htmlentities().
 *
 * @link    http://www.htmlhelp.com/reference/html40/entities/
 * @link    http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References)
 * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true
 * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true
 * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true
 *
 * @param   scalar|null  $s
 * @param   bool         $is_special_chars_only  Encode only the special html characters? (< > & ")
 * @return  scalar|null  Non-string input and the empty string are returned as is.
 *                       Returns FALSE if error occurred
 */
public static function html_entity_encode($s, $is_special_chars_only = false)
{
    if (! ReflectionTypeHint::isValid()) {
        return false;
    }
    if (! is_string($s) || $s === '') {
        return $s;
    }

    #htmlspecialchars() is not used for the special chars: it is charset dependent, strtr() is binary safe
    if ($is_special_chars_only) {
        $entity_to_char = self::$html_special_chars_table;
    } else {
        $entity_to_char = self::$html_entity_table;
    }
    #the tables are flipped to get "char => entity" pairs;
    #since PHP-5.3.x strtr() is faster here than preg_match_all() + str_replace()
    return strtr($s, array_flip($entity_to_char));
}
|
||
|
||
/**
 * Builds a regular expression fragment that matches the given string case-insensitively.
 * Example (only digits): "123" => "123"
 * Example (only ASCII):  "123_test" => "(?i:123_test)"
 * Example (upper ASCII): "123_слово_test" => "123_(с|С)(л|Л)(о|О)(в|В)(о|О)_[tT][eE][sS][tT]"
 *
 * @param   string|null       $s
 * @param   string|null       $delimiter  If the optional delimiter is specified, it will also be escaped.
 *                                        This is useful for escaping the delimiter that is required by the PCRE functions.
 *                                        The / is the most commonly used delimiter.
 * @return  string|bool|null  Non-string input and the empty string are returned as is.
 *                            Returns FALSE if error occurred
 */
public static function preg_quote_case_insensitive($s, $delimiter = null)
{
    if (! ReflectionTypeHint::isValid()) {
        return false;
    }
    if (! is_string($s) || $s === '') {
        return $s;
    }

    #speed improve: digits have no case, pure ASCII can rely on the (?i:...) group
    if (ctype_digit($s)) {
        return preg_quote($s, $delimiter);
    }
    if (self::is_ascii($s)) {
        return '(?i:' . preg_quote($s, $delimiter) . ')';
    }

    $lower = self::convert_case($s, CASE_LOWER, false);
    if ($lower === false) {
        return false;
    }
    $upper = self::convert_case($s, CASE_UPPER, false);
    if ($upper === false) {
        return false;
    }
    #speed improve: no letter in the string has two cases
    if ($lower === $upper) {
        return preg_quote($s, $delimiter);
    }

    $lower_chars = self::str_split($lower);
    if ($lower_chars === false) {
        return false;
    }
    $upper_chars = self::str_split($upper);
    if ($upper_chars === false) {
        return false;
    }

    $parts = array();
    foreach ($lower_chars as $i => $lc) {
        $uc = $upper_chars[$i];
        if ($lc === $uc) {
            #caseless character
            $parts[] = preg_quote($lc, $delimiter);
        } elseif (strlen($lc) === 1) {
            #single-byte letter: a character class is enough
            $parts[] = '[' . self::_preg_quote_class($lc . $uc, $delimiter) . ']';
        } else {
            #for russian and other letters an alternation is used, because the /u flag and (?i:word) do not help :(
            $parts[] = '(' . preg_quote($lc, $delimiter) . '|'
                           . preg_quote($uc, $delimiter) . ')';
        }
    }
    return implode('', $parts);
}
|
||
|
||
/**
 * Calls preg_match_all() and converts byte offsets into character offsets for the PREG_OFFSET_CAPTURE flag.
 * This is regardless of whether you use the /u modifier.
 * $char_offset is given in characters and is converted to a byte offset before the native call.
 *
 * @link    http://bolknote.ru/2010/09/08/~2704
 *
 * @param   string           $pattern
 * @param   string|null      $subject
 * @param   array            $matches
 * @param   int              $flags
 * @param   int              $char_offset
 * @return  int|bool|null    The value returned by the native preg_match_all().
 *                           Non-string $subject is returned as is.
 *                           Returns FALSE if error occurred
 */
public static function preg_match_all($pattern, $subject, &$matches, $flags = PREG_PATTERN_ORDER, $char_offset = 0)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($subject)) return $subject;

    $byte_offset = ($char_offset > 0) ? strlen(self::substr($subject, 0, $char_offset)) : $char_offset;

    $return = preg_match_all($pattern, $subject, $matches, $flags, $byte_offset);
    if ($return === false) return false;

    if ($flags & PREG_OFFSET_CAPTURE)
    {
        #elements are addressed by key instead of by reference, so no reference is left dangling in $matches
        foreach ($matches as $i => $match)
        {
            foreach ($match as $j => $a)
            {
                #a subpattern that did not take part in the match has no [text, offset] pair
                #or carries the offset -1: there is no position to convert
                if (! is_array($a) || $a[1] < 0) continue;
                $matches[$i][$j][1] = self::strlen(substr($subject, 0, $a[1]));
            }
        }
    }

    return $return;
}
|
||
|
||
#alias for self::str_limit(): all arguments are forwarded unchanged, $is_cutted is filled in by str_limit()
public static function truncate($s, $maxlength = null, $continue = "\xe2\x80\xa6", &$is_cutted = null, $tail_min_length = 20)
{
    $limited = self::str_limit($s, $maxlength, $continue, $is_cutted, $tail_min_length);
    return $limited;
}
|
||
|
||
/**
 * Truncates a text in UTF-8 encoding to the given length;
 * the last word is shown as a whole instead of being cut in the middle.
 * Html entities are handled correctly (an entity and a "\r\n" pair each count as one unit).
 *
 * @param   string|null       $s                Text in UTF-8 encoding
 * @param   int|null|digit    $maxlength        Text length limit (256 if empty)
 * @param   string            $continue         Trailing string that is appended to the text if it gets cut
 * @param   bool|null         &$is_cutted       Was the text cut?
 * @param   int|digit         $tail_min_length  If the length of the "tail" left after the cut is less than $tail_min_length,
 *                                              the text is returned unchanged
 * @return  string|bool|null  Non-string input and the empty string are returned as is.
 *                            Returns FALSE if error occurred
 */
public static function str_limit($s, $maxlength = null, $continue = "\xe2\x80\xa6", &$is_cutted = null, $tail_min_length = 20) #"\xe2\x80\xa6" = "…"
{
    if (! ReflectionTypeHint::isValid()) {
        return false;
    }
    if (! is_string($s) || $s === '') {
        return $s;
    }

    $is_cutted = false;
    if ($continue === null) {
        $continue = "\xe2\x80\xa6";
    }
    if (! $maxlength) {
        $maxlength = 256;
    }

    #speed improve block: cheap length checks before the expensive tokenizing
    #{{{
    if (strlen($s) <= $maxlength) {
        return $s;
    }
    #every "\r\n" pair and every html entity is collapsed to one placeholder character
    $probe = str_replace("\r\n", '?', $s);
    $probe = preg_replace('~' . self::HTML_ENTITY_RE . '~sxSX', '?', $probe);
    if (strlen($probe) <= $maxlength || self::strlen($probe) <= $maxlength) {
        return $s;
    }
    #}}}

    #split the text into units: a line break, an html entity or a single character
    $r = preg_match_all('~(?>   \r\n   # next line
                              | ' . self::HTML_ENTITY_RE . '
                              | .
                            )
                           ~sxuSX', $s, $units);
    if ($r === false) {
        return false;
    }
    if (count($units[0]) <= $maxlength) {
        return $s;
    }

    $left = implode('', array_slice($units[0], 0, $maxlength));
    #trailing ASCII characters are stripped, except letters, digits, opening paired symbols [a-zA-Z\d\(\{\[] and some others;
    #a trailing ";" must not be stripped, because it is used in entities (&xxx;)
    $left2 = rtrim($left, "\x00..\x28\x2A..\x2F\x3A\x3C..\x3E\x40\x5B\x5C\x5E..\x60\x7B\x7C\x7E\x7F");
    if (strlen($left) !== strlen($left2)) {
        #the cut fell on a word boundary
        $return = $left2 . $continue;
    } else {
        #the cut fell inside a word: append the rest of that word
        $right = implode('', array_slice($units[0], $maxlength));
        preg_match('/^(?>
                        #цифры, закрывающие парные символы, дефис для составных слов, дата, время, IP-адреса, URL типа www.ya.ru:80!
                        [\d\)\]\}\-\.:]+
                        #letters
                      | \p{L}+
                        #quotation marks
                      | [' . implode('', self::$html_quotation_mark_table) . ']+
                      )+
                    /suxSX', $right, $word_rest);
        $right = isset($word_rest[0]) ? rtrim($word_rest[0], '.-') : '';
        $return = $left . $right;
        if (strlen($return) !== strlen($s)) {
            $return .= $continue;
        }
    }
    #not worth cutting if only a short tail would be dropped
    if (self::strlen($s) - self::strlen($return) < $tail_min_length) {
        return $s;
    }

    $is_cutted = true;
    return $return;
}
|
||
|
||
/**
 * Implementation of str_split() for UTF-8 strings.
 * The string is split into single characters with a /u regular expression; for a
 * $length above 1 the characters are then glued into pieces of $length characters.
 *
 * @param   string|null      $s
 * @param   int|null|digit   $length  Piece length in characters (1 if NULL)
 * @return  array|bool|null  Non-string input is returned as is.
 *                           Returns FALSE if $length is below 1 or if a PCRE error occurred
 */
public static function str_split($s, $length = null)
{
    if (! ReflectionTypeHint::isValid()) {
        return false;
    }
    if (! is_string($s)) {
        return $s;
    }

    $size = ($length === null) ? 1 : intval($length);
    if ($size < 1) {
        return false;
    }
    #single characters are matched and grouped afterwards, because there are limits in regexp for {min,max}!
    if (preg_match_all('~.~suSX', $s, $found) === false) {
        return false;
    }
    if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) {
        return false;
    }

    $chars = $found[0];
    if ($size === 1) {
        return $chars;
    }

    $pieces = array();
    $total = count($chars);
    $i = 0;
    while ($i < $total) {
        $pieces[] = implode('', array_slice($chars, $i, $size));
        $i += $size;
    }
    return $pieces;
}
|
||
|
||
/**
 * Implementation of strlen() for UTF-8 strings: returns the length in characters.
 *
 * @param   string|null    $s
 * @return  int|bool|null  Non-string input is returned as is.
 *                         Returns FALSE if error occurred
 */
public static function strlen($s)
{
    if (! ReflectionTypeHint::isValid()) {
        return false;
    }
    if (! is_string($s)) {
        return $s;
    }

    #since PHP-5.3.x mb_strlen() is faster than strlen(utf8_decode())
    if (function_exists('mb_strlen')) {
        return mb_strlen($s, 'utf-8');
    }

    #Fallback, per the original author's notes:
    #utf8_decode() converts characters that are not in ISO-8859-1 to '?', which is fine for counting,
    #and bad UTF-8 bytes are not counted.
    #Alternatives considered and rejected as slower: iconv_strlen(), preg_match_all('~.~suSX') + count(),
    #and a manual loop over the bytes.
    $single_byte = utf8_decode($s);
    return strlen($single_byte);
}
|
||
|
||
/**
 * Implementation of strpos() for UTF-8 strings: positions are counted in characters.
 *
 * @param   string|null    $s       The entire string
 * @param   string|int     $needle  The searched substring
 * @param   int|null       $offset  The optional offset parameter specifies the position from which the search should be performed
 *                                  (NULL and negative values are treated as 0)
 * @return  int|bool|null  Returns the numeric position of the first occurrence of needle in haystack.
 *                         If needle is not found, will return FALSE.
 *                         Non-string $s is returned as is.
 */
public static function strpos($s, $needle, $offset = null)
{
    if (! ReflectionTypeHint::isValid()) {
        return false;
    }
    if (! is_string($s)) {
        return $s;
    }

    if ($offset === null || $offset < 0) {
        $offset = 0;
    }
    #mb_strpos() is faster than iconv_strpos();
    #iconv_strpos() is not used at all, because it is slower than self::strlen(substr())
    if (function_exists('mb_strpos')) {
        return mb_strpos($s, $needle, $offset, 'utf-8');
    }

    #Fallback: search by bytes, convert every hit to a character position
    #and skip the hits located before the requested character offset.
    $byte_pos = $offset;
    while (true) {
        $byte_pos = strpos($s, $needle, $byte_pos);
        if ($byte_pos === false) {
            return false;
        }
        $char_pos = self::strlen(substr($s, 0, $byte_pos));
        $byte_pos++; #the next search starts right after this hit
        if (! ($char_pos < $offset)) {
            return $char_pos;
        }
    }
}
|
||
|
||
/**
 * Finds the position of the first occurrence of a case-insensitive string (UTF-8 aware).
 * Uses mb_stripos() when available; otherwise both operands are lowercased with
 * self::convert_case() and searched with self::strpos().
 *
 * @param   string|null    $s       The entire string
 * @param   string|int     $needle  The searched substring
 * @param   int|null       $offset  The optional offset parameter specifies the position from which the search should be performed
 *                                  (NULL and negative values are treated as 0)
 * @return  int|bool|null  Returns the numeric position of the first occurrence of needle in haystack.
 *                         If needle is not found, will return FALSE.
 *                         Non-string $s is returned as is.
 */
public static function stripos($s, $needle, $offset = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($s)) return $s;

    if ($offset === null || $offset < 0) $offset = 0;
    if (function_exists('mb_stripos')) return mb_stripos($s, $needle, $offset, 'utf-8');

    #optimization block (speed improve)
    #{{{
    $is_ascii_haystack = (bool) self::is_ascii($s);
    $is_ascii_needle   = (bool) self::is_ascii($needle);
    if ($is_ascii_haystack)
    {
        #an ASCII haystack cannot contain a needle with non-ASCII characters
        if (! $is_ascii_needle) return false;
        #both are ASCII: byte positions are equal to character positions
        return stripos($s, $needle, $offset);
    }
    #a non-ASCII haystack may still contain an ASCII needle, so that case goes through the generic search below
    #}}}

    $s = self::convert_case($s, CASE_LOWER, false);
    if ($s === false) return false;
    $needle = self::convert_case($needle, CASE_LOWER, false);
    if ($needle === false) return false;
    return self::strpos($s, $needle, $offset);
}
|
||
|
||
/**
 * Implementation of strrev() for UTF-8 strings: reverses the order of characters, not bytes.
 *
 * @param   string|null       $s
 * @return  string|bool|null  Non-string input and the empty string are returned as is.
 *                            Returns FALSE if error occurred
 */
public static function strrev($s)
{
    if (! ReflectionTypeHint::isValid()) {
        return false;
    }
    if (! is_string($s) || $s === '') {
        return $s;
    }

    #TODO test speed of the alternative: convert to UTF-32, reverse the 4-byte groups, convert back

    $chars = self::str_split($s);
    if (! is_array($chars)) {
        return false;
    }
    return implode('', array_reverse($chars));
}
|
||
|
||
/**
 * Implementation of substr() for UTF-8 strings: $offset and $length are counted in characters.
 * Uses mb_substr() or iconv_substr() when available; otherwise the string is split
 * into characters (the split of the last processed string is kept in static variables).
 *
 * @link    http://www.w3.org/International/questions/qa-forms-utf-8.html
 * @param   string|null       $s
 * @param   int|digit         $offset
 * @param   int|null|digit    $length
 * @return  string|bool|null  Non-string input is returned as is.
 *                            Returns FALSE if error occurred
 */
public static function substr($s, $offset, $length = null)
{
    if (! ReflectionTypeHint::isValid()) {
        return false;
    }
    if (! is_string($s)) {
        return $s;
    }

    #since PHP-5.3.x mb_substr() is faster than iconv_substr()
    if (function_exists('mb_substr')) {
        #the extension gets an explicit length: the whole string length means "up to the end"
        $count = ($length === null) ? self::strlen($s) : $length;
        return mb_substr($s, $offset, $count, 'utf-8');
    }
    if (function_exists('iconv_substr')) {
        $count = ($length === null) ? self::strlen($s) : $length;
        return iconv_substr($s, $offset, $count, 'utf-8');
    }

    #pure PHP fallback; repeated calls with the same string reuse the previous split
    static $last_string = null;
    static $last_chars  = null;

    if ($last_string !== $s) {
        $last_string = $s;
        $last_chars  = self::str_split($s);
    }
    if (! is_array($last_chars)) {
        return false;
    }
    if ($length === null) {
        $piece = array_slice($last_chars, $offset);
    } else {
        $piece = array_slice($last_chars, $offset, $length);
    }
    return implode('', $piece);
}
|
||
|
||
/**
 * Implementation of substr_replace() for UTF-8 strings: $start and $length are counted in characters.
 * The string is split into characters, the requested range is replaced with array_splice()
 * and the characters are joined again.
 *
 * @param   string|null       $s
 * @param   string|int        $replacement
 * @param   int|digit         $start
 * @param   int|null          $length  NULL (default) replaces everything from $start to the end of the string
 * @return  string|bool|null  Non-string input and the empty string are returned as is.
 *                            Returns FALSE if error occurred
 */
public static function substr_replace($s, $replacement, $start, $length = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($s) || $s === '') return $s;

    $a = self::str_split($s);
    if (! is_array($a)) return false;
    #array_splice() accepts a NULL length only since PHP 8.0; older versions coerce it to 0
    #and would insert the replacement instead of replacing the tail, so the length is made explicit
    if ($length === null) $length = count($a);
    array_splice($a, $start, $length, $replacement);
    return implode('', $a);
}
|
||
|
||
/**
 * Implementation of ucfirst() for UTF-8 strings.
 * Converts the first letter of a string in UTF-8 encoding to upper case.
 * Words in quotation marks are handled correctly, for example: «северный поток» --> «Северный поток»
 *
 * @param   string|null       $s
 * @param   bool              $is_other_to_lowercase  Convert the remaining characters to lower case?
 * @return  string|bool|null  The string is returned unchanged if it does not start with a letter
 *                            (optionally preceded by one or two quotation marks).
 *                            Non-string input and the empty string are returned as is.
 *                            Returns FALSE if error occurred
 */
public static function ucfirst($s, $is_other_to_lowercase = true)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if ($s === '' || ! is_string($s)) return $s;

    #the leading quotation marks are optional ({0,2}): with {1,2} a plain word
    #without quotation marks never matched and was returned unchanged
    if (! preg_match('/^([' . implode('', self::$html_quotation_mark_table) . ']{0,2}+)  #1 quotation marks
                        (\p{L})  #2 first letter
                        (.*+)    #3 next letters
                      $/sxuSX', $s, $m)) return $s; #letters not found
    return $m[1] . self::uppercase($m[2]) . ($is_other_to_lowercase ? self::lowercase($m[3]) : $m[3]);
}
|
||
|
||
/**
 * Implementation of ucwords() for UTF-8 strings.
 * Converts the first letter of every word of a string in UTF-8 encoding to upper case;
 * the other characters of every word are converted to lower case (see $is_other_to_lowercase).
 * The string is split by $spaces_re, the separators are kept and every piece is passed to self::ucfirst().
 *
 * @param   string|null       $s
 * @param   bool              $is_other_to_lowercase  Convert the remaining characters to lower case?
 * @param   string            $spaces_re              Regular expression for the word separators (with a capturing group,
 *                                                    so that the separators are preserved in the result)
 * @return  string|bool|null  Non-string input and the empty string are returned as is.
 *                            Returns FALSE if error occurred
 */
public static function ucwords($s, $is_other_to_lowercase = true, $spaces_re = '~([\p{Z}\s]+)~suSX')
{
    if (! ReflectionTypeHint::isValid()) return false;
    if ($s === '' || ! is_string($s)) return $s;

    $words = preg_split($spaces_re, $s, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
    #preg_split() gives FALSE on a PCRE error (e.g. broken UTF-8 with the /u modifier or a bad $spaces_re)
    if (! is_array($words)) return false;
    foreach ($words as $k => $word)
    {
        $words[$k] = self::ucfirst($word, $is_other_to_lowercase);
        if ($words[$k] === false) return false;
    }
    return implode('', $words);
}
|
||
|
||
/**
 * Decodes a string to a UTF-8 string from some formats (can be mixed).
 * Examples
 *   '%D1%82%D0%B5%D1%81%D1%82'        => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"  #binary (regular)
 *   '0xD182D0B5D181D182'              => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"  #binary (compact)
 *   '%u0442%u0435%u0441%u0442'        => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"  #UCS-2 (U+0 — U+FFFF)
 *   '%u{442}%u{435}%u{0441}%u{00442}' => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"  #UTF-8 (U+0 — U+FFFFFF)
 *
 * It is used to decode data in the %uXXXX format, produced by the deprecated
 * javascript function escape(). It is recommended to use encodeURIComponent() instead.
 * The obsolete %uXXXX format allows unicode only in the range of UCS-2, i.e. U+0 to U+FFFF.
 *
 * Arrays are processed recursively, string keys are decoded as well.
 *
 * @see     urldecode()
 * @param   array|scalar|null  $data
 * @param   bool               $is_hex2bin    Decode the HEX-data?
 *                                            Example: '0xD182D0B5D181D182' => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"
 *                                            Hint: parameters in a URL are sometimes convenient to encode
 *                                            not with rawurlencode($string), but with the following mechanism
 *                                            (the encoded data is more compact):
 *                                            '0x' . bin2hex($string)
 * @param   bool               $is_urldecode  Apply urldecode() to the result?
 * @return  array|scalar|null  Non-string scalars and NULL are returned as is.
 *                             Returns FALSE if error occurred (e.g. an object or a resource was given)
 */
public static function unescape($data, $is_hex2bin = false, $is_urldecode = true)
{
    if (! ReflectionTypeHint::isValid()) {
        return false;
    }

    if (is_array($data)) {
        $decoded = array();
        foreach ($data as $key => $value) {
            if (is_string($key)) {
                $key = self::unescape($key, $is_hex2bin, $is_urldecode);
                if (! is_string($key)) {
                    return false;
                }
            }
            $decoded[$key] = self::unescape($value, $is_hex2bin, $is_urldecode);
            #FALSE is an error marker only when the source value was not a boolean itself
            if ($decoded[$key] === false && ! is_bool($value)) {
                return false;
            }
        }
        return $decoded;
    }

    if (! is_string($data)) {
        if (is_scalar($data) || is_null($data)) {
            return $data; #~ null, integer, float, boolean
        }
        return false; #object or resource
    }

    #Both custom formats are first rewritten to the regular %XX form,
    #which is decoded by urldecode() at the very end.
    #strpos() is used to skip the regexp when there is nothing to replace (speed improve)
    if ($is_hex2bin && strpos($data, '0x') !== false) {
        $data = preg_replace_callback(
            '~0x((?:[\da-fA-F]{2})+)~sSX',
            function (array $m)
            {
                $binary = pack('H' . strlen($m[1]), $m[1]); #hex2bin()
                return rawurlencode($binary);
            },
            $data
        );
    }
    if (strpos($data, '%u') !== false) {
        $class = __CLASS__;
        $data = preg_replace_callback(
            '~%u(  [\da-fA-F]{4}+          #%uXXXX     only UCS-2
                 | \{ [\da-fA-F]{1,6}+ \}  #%u{XXXXXX} extended form for all UNICODE charts
                )
            ~sxSX',
            function (array $m) use ($class)
            {
                $codepoint = hexdec(trim($m[1], '{}'));
                $char = $class::chr($codepoint);
                return rawurlencode($char);
            },
            $data
        );
    }
    if ($is_urldecode) {
        return urldecode($data);
    }
    return $data;
}
|
||
|
||
/**
 * Rebuilds the request superglobals from the raw request data.
 *
 * 1) Corrects the global arrays $_GET, $_POST, $_COOKIE, $_REQUEST, $_FILES:
 *    values are decoded from %XX and from the extended %uXXXX / %u{XXXXXX} format,
 *    produced, for example, by the outdated javascript function escape().
 *    Standard PHP5 cannot do it.
 * 2) Recodes $_GET, $_POST, $_COOKIE, $_REQUEST, $_FILES from the $charset
 *    encoding to UTF-8, if necessary.
 *    A side effect is a positive protection against XSS attacks with
 *    non-printable characters on the vulnerable PHP function.
 *    Thus web forms can be sent to the server in 2 encodings: $charset and UTF-8.
 *    For example: ?тест[тест]=тест
 * 3) If the HTTP_COOKIE contains parameters with the same name,
 *    the last value is taken (as in the QUERY_STRING), not the first.
 * 4) Creates the $_POST array for a non-standard Content-Type, for example,
 *    "Content-Type: application/octet-stream". Standard PHP5 creates
 *    the array only for "Content-Type: application/x-www-form-urlencoded"
 *    and "Content-Type: multipart/form-data".
 *
 * Examples
 *   '%F2%E5%F1%F2'                    => 'тест'  #CP1251 (regular)
 *   '0xF2E5F1F2'                      => 'тест'  #CP1251 (compact)
 *   '%D1%82%D0%B5%D1%81%D1%82'        => 'тест'  #UTF-8 (regular)
 *   '0xD182D0B5D181D182'              => 'тест'  #UTF-8 (compact)
 *   '%u0442%u0435%u0441%u0442'        => 'тест'  #UCS-2 (U+0 — U+FFFF)
 *   '%u{442}%u{435}%u{0441}%u{00442}' => 'тест'  #UTF-8 (U+0 — U+FFFFFF)
 *
 * Sessions, cookies and independent authorization on subdomains.
 *
 * EXAMPLE 1
 * The production site http://domain.com got subdomains.
 * For cross-domain authorization through the session mechanism the COOKIE host name was changed from "domain.com" to ".domain.com"
 * As a result the authorization stopped working. Solution: change the session name.
 * Clearing the COOKIE helps too, but forcing that on thousands of user computers is problematic.
 * PHP handles the HTTP_COOKIE header incorrectly (?) when it contains parameters with the same name but different values.
 * Example of the HTTP header sent by the client: "Cookie: sid=chpgs2fiak-330mzqza; sid=cmz5tnp5zz-xlbbgqp"
 * In this case the server takes the first value, not the last one.
 * Whereas in the same situation in the QUERY_STRING the last parameter is always taken.
 * Two parameters with the same name can appear in the HTTP_COOKIE if the following HTTP headers are sent to the client:
 * "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=domain.com" (domain.com only)
 * "Set-Cookie: sid=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (domain.com and all its subdomains)
 *
 * EXAMPLE 2
 * There are production sites: http://domain.com (main), http://admin.domain.com (admin panel),
 * http://sub1.domain.com (subproject 1), http://sub2.domain.com (subproject 2).
 * There is also a development server http://dev.domain.com, which may have subdomains of its own.
 * Independent cross-domain authorization is required for http://*.domain.com and http://*.dev.domain.com.
 * The authorization state is kept in a session whose name and value are written to the COOKIE.
 * Since the domains http://*.dev.domain.com overlap with the domains http://*.domain.com,
 * different session names must be used for independent authorization!
 * Example of the HTTP headers of the server response:
 * "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (.domain.com and all its subdomains)
 * "Set-Cookie: sid.dev=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.dev.domain.com" (dev.domain.com and all its subdomains)
 *
 * @link    http://tools.ietf.org/html/rfc2965  RFC 2965 - HTTP State Management Mechanism
 * @param   bool    $is_hex2bin  Decode the HEX-data?
 *                               Example: '0xD182D0B5D181D182' => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"
 *                               Hint: parameters in the URL address is sometimes
 *                               convenient to encode not function rawurlencode($string),
 *                               and use the following mechanism (encoded data is more compact):
 *                               '0x' . bin2hex($string)
 * @param   string  $charset
 * @return  bool                 FALSE if decoding or charset conversion failed, TRUE otherwise
 */
public static function unescape_request($is_hex2bin = false, $charset = 'ISO-8859-1')
{
    # becomes TRUE as soon as at least one superglobal has been rebuilt
    $fixed = false;
    #ATTENTION! HTTP_RAW_POST_DATA is only accessible when Content-Type of POST request is NOT default "application/x-www-form-urlencoded"!
    # for POST requests take the raw body from the global variable if it is set, otherwise read php://input
    $HTTP_RAW_POST_DATA = isset($_SERVER['REQUEST_METHOD']) && $_SERVER['REQUEST_METHOD'] === 'POST' ? (isset($GLOBALS['HTTP_RAW_POST_DATA']) ? $GLOBALS['HTTP_RAW_POST_DATA'] : @file_get_contents('php://input')) : null;
    if (ini_get('always_populate_raw_post_data')) $GLOBALS['HTTP_RAW_POST_DATA'] = $HTTP_RAW_POST_DATA;
    # map: name of the superglobal => raw source it is rebuilt from
    foreach (array( '_GET' => isset($_SERVER['QUERY_STRING']) ? $_SERVER['QUERY_STRING'] : null,
                    '_POST' => $HTTP_RAW_POST_DATA,
                    '_COOKIE' => isset($_SERVER['HTTP_COOKIE']) ? $_SERVER['HTTP_COOKIE'] : null,
                    '_FILES' => isset($_FILES) ? $_FILES : null,
                  ) as $k => $v)
    {
        # only raw strings are processed
        # NOTE(review): the '_FILES' entry is $_FILES itself (an array), so it looks like it is always skipped here — confirm
        if (! is_string($v)) continue;

        if ($k === '_COOKIE')
        {
            # turn the "; " cookie separators into "&" so that the header can be parsed like a query string
            $v = preg_replace('/; *+/sSX', '&', $v);
            unset($_COOKIE); #we parse HTTP_COOKIE ourselves so that it is processed the same way as QUERY_STRING
        }

        # %uXXXX and 0xHEX sequences become %XX here; urldecode is switched off (third argument is FALSE), parse_str() below decodes %XX
        $v = self::unescape($v, $is_hex2bin, false);
        if ($v === false) return false;
        parse_str($v, $GLOBALS[$k]);

        # recode keys and values from $charset
        $GLOBALS[$k] = self::convert_from($GLOBALS[$k], $charset);
        if ($GLOBALS[$k] === false)
        {
            trigger_error('Array $' . $k . ' does not have keys/values in UTF-8 charset!', E_USER_WARNING);
            return false;
        }

        $fixed = true;
    }
    if ($fixed)
    {
        # array union keeps the value of the left operand for duplicate keys: COOKIE wins over POST, POST wins over GET
        $_REQUEST =
            (isset($_COOKIE) ? $_COOKIE : array()) +
            (isset($_POST) ? $_POST : array()) +
            (isset($_GET) ? $_GET : array());
    }
    return true;
}
|
||
|
||
/**
 * Calculates the height (number of rows) of the edit area of a <textarea> html tag by its value and width.
 *
 * In most cases the result is correct for monospaced fonts.
 * Because the browser moves the last word that does not fit on a line
 * to the next line, the height may be less than expected.
 * This algorithm is simple (and fast) and does not track word wrapping.
 *
 * @param   string|null  $s         Text
 * @param   int|digit    $cols      Width of the edit area (columns)
 * @param   int|digit    $min_rows  Minimum number of rows
 * @param   int|digit    $max_rows  Maximum number of rows
 * @return  int|bool|null           Number of rows (lines)
 */
public static function textarea_rows($s, $cols, $min_rows = 3, $max_rows = 32)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($s)) return $s;

    if ($s === '') return $min_rows;  #speed improve

    #utf8_decode() converts characters that are not in ISO-8859-1 to '?',
    #so after decoding one byte corresponds to one character and strlen() can be used
    $lines = preg_split('/\r\n|[\r\n]/sSX', utf8_decode($s));
    $total = 0;
    foreach ($lines as $text)
    {
        #every line occupies at least one row; one extra position is counted per line
        $total += ceil((strlen($text) + 1) / $cols);
        if ($total > $max_rows) return $max_rows;
    }
    if ($total < $min_rows) return $min_rows;
    return $total;
}
|
||
|
||
/**
 * Strips characters from the beginning of a UTF-8 string (UTF-8 aware analogue of ltrim()).
 *
 * @param   string|null  $s         Source string; a non-string or an empty string is returned as is
 * @param   string|null  $charlist  Characters to strip; NULL means the default set of the native ltrim()
 * @return  string|bool|null        Returns FALSE if ReflectionTypeHint::isValid() fails
 */
public static function ltrim($s, $charlist = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($s) || $s === '') return $s;
    if ($charlist === null) return ltrim($s);
    #ASCII bytes never occur inside multibyte UTF-8 sequences, so the native function is safe here;
    #the character list must be passed on, otherwise it is silently ignored
    if (self::is_ascii($charlist)) return ltrim($s, $charlist);
    return preg_replace('~^[' . self::_preg_quote_class($charlist, '~') . ']+~suSX', '', $s);
}
|
||
|
||
/**
 * Strips characters from the end of a UTF-8 string (UTF-8 aware analogue of rtrim()).
 *
 * @param   string|null  $s         Source string; a non-string or an empty string is returned as is
 * @param   string|null  $charlist  Characters to strip; NULL means the default set of the native rtrim()
 * @return  string|bool|null        Returns FALSE if ReflectionTypeHint::isValid() fails
 */
public static function rtrim($s, $charlist = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($s) || $s === '') return $s;
    if ($charlist === null) return rtrim($s);
    #ASCII bytes never occur inside multibyte UTF-8 sequences, so the native function is safe here;
    #the character list must be passed on, otherwise it is silently ignored
    if (self::is_ascii($charlist)) return rtrim($s, $charlist);
    return preg_replace('~[' . self::_preg_quote_class($charlist, '~') . ']+$~suSX', '', $s);
}
|
||
|
||
/**
 * Strips characters from both ends of a UTF-8 string (UTF-8 aware analogue of trim()).
 *
 * @param   scalar|null  $s         Source value; a non-string or an empty string is returned as is
 * @param   string|null  $charlist  Characters to strip; NULL means the default set of the native trim()
 * @return  scalar|null             Returns FALSE if ReflectionTypeHint::isValid() fails
 */
public static function trim($s, $charlist = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($s) || $s === '') return $s;
    if ($charlist === null) return trim($s);
    #ASCII bytes never occur inside multibyte UTF-8 sequences, so the native function is safe here;
    #the character list must be passed on, otherwise it is silently ignored
    if (self::is_ascii($charlist)) return trim($s, $charlist);
    $charlist_re = self::_preg_quote_class($charlist, '~');
    #strip the head first, then the tail of what is left
    $s = preg_replace('~^[' . $charlist_re . ']+~suSX', '', $s);
    return preg_replace('~[' . $charlist_re . ']+$~suSX', '', $s);
}
|
||
|
||
/**
 * Escapes the characters of $charlist that have a special meaning
 * inside a PCRE character class, so the result can be placed between "[" and "]".
 *
 * @param   string       $charlist   List of characters
 * @param   string|null  $delimiter  Pattern delimiter that must be escaped as well
 * @return  string
 */
private static function _preg_quote_class($charlist, $delimiter = null)
{
    #return preg_quote($charlist, $delimiter);  #DEPRECATED
    $quote_table = array(
        '\\' => '\\\\',
        '-'  => '\-',
        ']'  => '\]',
        '^'  => '\^',  #a leading "^" would negate the whole class
        '['  => '\[',  #"[:name:]" would be taken for a POSIX class
    );
    #an empty delimiter is skipped: strtr() does not accept an empty key in the replacement table
    if (is_string($delimiter) && $delimiter !== '') $quote_table[$delimiter] = '\\' . $delimiter;
    return strtr($charlist, $quote_table);
}
|
||
|
||
/**
 * Pads a UTF-8 string to a certain length (in characters) with another string
 * (UTF-8 aware analogue of str_pad()).
 *
 * @param   string|null  $s        Source string; a non-string is returned as is
 * @param   int|digit    $length   Required length in characters; if it does not exceed the length of $s, $s is returned unchanged
 * @param   string       $pad_str  Padding string, must not be empty
 * @param   int          $type     STR_PAD_LEFT, STR_PAD_RIGHT or STR_PAD_BOTH
 * @return  string|bool|null       Returns FALSE if error occurred
 */
public static function str_pad($s, $length, $pad_str = ' ', $type = STR_PAD_RIGHT)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($s)) return $s;

    $input_len = self::strlen($s);
    if ($length <= $input_len) return $s;

    $pad_str_len = self::strlen($pad_str);
    if (! $pad_str_len)
    {
        #an empty padding string would cause a division by zero below
        trigger_error('Parameter 3 should not be an empty string!', E_USER_WARNING);
        return false;
    }
    $pad_len = $length - $input_len;

    if ($type == STR_PAD_RIGHT)
    {
        #repeat the padding enough times, then cut the result to the exact length
        $repeat_num = intval(ceil($pad_len / $pad_str_len));
        return self::substr($s . str_repeat($pad_str, $repeat_num), 0, $length);
    }

    if ($type == STR_PAD_LEFT)
    {
        $repeat_num = intval(ceil($pad_len / $pad_str_len));
        return self::substr(str_repeat($pad_str, $repeat_num), 0, intval(floor($pad_len))) . $s;
    }

    if ($type == STR_PAD_BOTH)
    {
        #when the padding cannot be divided evenly, the right side gets the extra character
        $pad_len /= 2;
        $pad_amount_left    = intval(floor($pad_len));
        $pad_amount_right   = intval(ceil($pad_len));
        $repeat_times_left  = intval(ceil($pad_amount_left / $pad_str_len));
        $repeat_times_right = intval(ceil($pad_amount_right / $pad_str_len));

        $padding_left  = self::substr(str_repeat($pad_str, $repeat_times_left), 0, $pad_amount_left);
        $padding_right = self::substr(str_repeat($pad_str, $repeat_times_right), 0, $pad_amount_right);
        return $padding_left . $s . $padding_right;
    }

    trigger_error('Parameter 4 should be a constant of STR_PAD_RIGHT, STR_PAD_LEFT or STR_PAD_BOTH!', E_USER_WARNING);
    return false;
}
|
||
|
||
/**
 * Returns the length (in characters) of the initial segment of $str
 * that consists entirely of characters contained in $mask (UTF-8 aware analogue of strspn()).
 *
 * @param   string    $str     Source string
 * @param   string    $mask    List of allowed characters
 * @param   int|null  $start   If $start or $length is given, only self::substr($str, $start, $length) is examined
 * @param   int|null  $length
 * @return  int|bool           Returns FALSE if ReflectionTypeHint::isValid() fails
 */
public static function strspn($str, $mask, $start = null, $length = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    #if (self::is_ascii($str) && self::is_ascii($mask)) return strspn($str, $mask, $start, $length);
    #an empty mask cannot match anything (and "[]+" is not a valid pattern)
    if ($mask === '') return 0;
    if ($start !== null || $length !== null) $str = self::substr($str, $start, $length);
    #the length of the match must be returned, otherwise the result is always 0
    if (preg_match('~^[' . preg_quote($mask, '~') . ']+~uSX', $str, $m)) return self::strlen($m[0]);
    return 0;
}
|
||
|
||
/**
 * Recodes the text files of the specified folder into UTF-8.
 * Binary files, files that are already in UTF-8 and files that could not be converted are skipped,
 * so the method works reliably enough.
 *
 * In *.htm, *.html, *.xhtml, *.phtml and *.tpl files the charset name of the <meta> tag
 * and in *.xml, *.xsl and *.tpl files the encoding name of the xml declaration is replaced with "utf-8".
 *
 * @param   string       $dir             Directory to scan
 * @param   string|null  $files_re        Regular expression for the file names,
 *                                        for example: '~\.(?:txt|sql|php|pl|py|sh|tpl|xml|xsl|html|xhtml|phtml|htm|js|json|css|conf|cfg|ini|htaccess)$~sSX'
 * @param   bool         $is_recursive    Process nested folders and files?
 * @param   string       $charset         Source charset
 * @param   string|null  $dirs_ignore_re  Regular expression to exclude folders from processing,
 *                                        for example: '~^(?:cache|images?|photos?|fonts?|img|ico|\.svn|\.hg|\.cvs)$~siSX'
 * @param   bool         $is_echo         Print the names of the processed files and the processing status to the output?
 * @param   bool         $is_simulate     Simulate the work without really rewriting the files?
 * @return  int|bool                      Returns the number of recoded files
 *                                        Returns FALSE if error occurred
 */
public static function convert_files_from(
    $dir,
    $files_re = null,
    $is_recursive = true,
    $charset = 'CP1251',
    $dirs_ignore_re = null,
    $is_echo = false,
    $is_simulate = false)
{
    if (! ReflectionTypeHint::isValid()) return false;

    $dh = opendir($dir);
    if (! is_resource($dh)) return false;
    $counter = 0;
    #errors leave the loop via break, so that the directory handle is always closed below
    $is_failed = false;
    while (($name = readdir($dh)) !== false)
    {
        if ($name == '.' || $name == '..') continue;
        $file = $dir . '/' . $name;
        if (is_file($file))
        {
            if (is_string($files_re) && ! preg_match($files_re, $name)) continue;
            if ($is_echo) echo $file;

            $s = @file_get_contents($file);
            if (! is_string($s))
            {
                if ($is_echo) echo '  Error to reading' . PHP_EOL;
                $is_failed = true;
                break;
            }

            if (self::is_utf8($s))
            {
                if ($is_echo) echo '  Already UTF-8, skipped' . PHP_EOL;
                continue;
            }

            if (self::has_binary($s))
            {
                if ($is_echo) echo '  Binary file, skipped' . PHP_EOL;
                continue;
            }

            $s = self::convert_from($s, $charset);
            if (! is_string($s) || ! self::is_utf8($s))
            {
                if ($is_echo) echo '  Error to converting (source file not in ' . $charset . '?)' . PHP_EOL;
                continue;
            }

            #the charset declared inside the document must match the new encoding
            $ext = strtolower(pathinfo($name, PATHINFO_EXTENSION));
            if ($ext === 'htm' || $ext === 'html' || $ext === 'xhtml' || $ext === 'phtml' || $ext === 'tpl')
            {
                $s = preg_replace('~(<meta [\x00-\x20]++
(?: content="text/html; [\x00-\x20]++ charset= #HTML4
| charset=" #HTML5
)
) #1
[-a-z\d]++ #charset name
(" [^>]* >) #2
~sixSX', '$1utf-8$2', $s);
            }
            if ($ext === 'xml' || $ext === 'xsl' || $ext === 'tpl')
            {
                $s = preg_replace('~(<\?xml [\x00-\x20]++ encoding=") #1
[-a-z\d]++ #charset name
(" .*? \?>) #2
~sixSX', '$1utf-8$2', $s);
            }

            if (! $is_simulate)
            {
                $bytes = @file_put_contents($file, $s);
                if ($bytes === false)
                {
                    if ($is_echo) echo '  Error to writing' . PHP_EOL;
                    $is_failed = true;
                    break;
                }
            }
            if ($is_echo) echo '  ' . $charset . ' to UTF-8 converted' . PHP_EOL;
            $counter++;
        }
        elseif ($is_recursive && is_dir($file))
        {
            if (! is_string($dirs_ignore_re) || ! preg_match($dirs_ignore_re, $name))
            {
                $c = self::convert_files_from($file, $files_re, $is_recursive, $charset, $dirs_ignore_re, $is_echo, $is_simulate);
                if ($c === false)
                {
                    $is_failed = true;
                    break;
                }
                $counter += $c;
            }
        }
    }
    closedir($dh);
    return $is_failed ? false : $counter;
}
|
||
|
||
/**
 * Creates an array containing a range of elements (UTF-8 aware analogue of range()).
 * If at least one of the bounds is an integer, the native range() is used;
 * otherwise the bounds are treated as UTF-8 characters and a list of characters is returned.
 *
 * @param   int|string  $low   First value of the sequence
 * @param   int|string  $high  Last value of the sequence
 * @param   int         $step  Increment between the elements
 * @return  array|bool         Returns FALSE if error occurred
 */
public static function range($low, $high, $step = 1)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_int($low) || is_int($high)) return range($low, $high, $step);  #speed improve
    $low_cp  = self::ord($low);
    $high_cp = self::ord($high);
    if (! is_int($low_cp) || ! is_int($high_cp)) return false;
    #build the range over the code points and convert every code point back to a character;
    #__CLASS__ is used because the "self" string inside a callable is deprecated since PHP 8.2
    $codepoints = range($low_cp, $high_cp, $step);
    return array_map(array(__CLASS__, 'chr'), $codepoints);
}
|
||
|
||
/**
 * Translates characters or replaces substrings (UTF-8 aware analogue of strtr()).
 * If $from is an array, it is used as the table of replacement pairs and $to is ignored.
 * Otherwise every character of $from is replaced with the character of $to at the same position;
 * like in the native strtr(), the extra characters of the longer of the two strings are ignored.
 *
 * @param   string|null   $s     Source string; a non-string or an empty string is returned as is
 * @param   string|array  $from  Characters to replace or a table of replacement pairs
 * @param   string|null   $to    Replacement characters
 * @return  string|bool|null     Returns FALSE if error occurred
 */
public static function strtr($s, $from, $to = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($s) || $s === '') return $s;
    if (is_array($from)) return strtr($s, $from);  #speed improve
    #nothing to translate: the native strtr() returns the string unchanged in this case
    if ($from === '' || $to === '') return $s;
    $keys   = self::str_split($from);
    $values = self::str_split($to);
    if (! is_array($keys) || ! is_array($values)) return false;
    #array_combine() requires the same number of elements on both sides
    $count = min(count($keys), count($values));
    if ($count === 0) return $s;
    $table = array_combine(array_slice($keys, 0, $count), array_slice($values, 0, $count));
    if (! is_array($table)) return false;
    return strtr($s, $table);
}
|
||
|
||
public static function tests()
{
    # Self-test of the class: every string of $a is a PHP expression that is handed to assert().
    # Returns TRUE if all the assertions hold and FALSE on the first assert() that returns a false value.
    # NOTE(review): assert() evaluates string arguments as code only in old PHP versions — confirm the targeted PHP version.
    assert_options(ASSERT_ACTIVE, true);
    assert_options(ASSERT_BAIL, true);
    assert_options(ASSERT_WARNING, true);
    assert_options(ASSERT_QUIET_EVAL, false);
    $a = array(
        # NOTE(review): the html_entity_decode() literals below look as if their HTML entities
        # were decoded when this copy of the file was produced — verify against the upstream source
        'self::html_entity_decode(""&<>", true) === "\"&<>"',
        'self::html_entity_decode(""&<>", false) === ""&<>"',
        'self::html_entity_decode("&amp;", true) === "&"',
        'self::html_entity_decode("&amp;", false) === "&amp;"',
        'self::html_entity_decode(""", true) === "\""',
        'self::html_entity_decode(""", false) === """',
        'self::html_entity_decode("'", true) === "\'"',
        'self::html_entity_decode("'", false) === "\'"',
        'self::html_entity_decode(""", true) === "\""',
        'self::html_entity_decode(""", false) === """',

        # case conversion of array keys
        'self::array_change_key_case(array("АБВГД" => "АБВГД"), CASE_LOWER) === array("абвгд" => "АБВГД")',
        'self::array_change_key_case(array("абвгд" => "абвгд"), CASE_UPPER) === array("АБВГД" => "абвгд")',

        # unicode blocks checking
        'self::blocks_check("Яндекс", "Cyrillic") === true',
        'self::blocks_check("Google", "Basic Latin") === true',
        'self::blocks_check("Google & Яндекс", array("Basic Latin", "Cyrillic")) === true',
        'self::blocks_check("Ё-моё, Yandex!", array(array(0x20, 0x7E), #[\x20-\x7E]
array(0x0410, 0x044F), #[A-Яa-я]
0x0401, #russian yo (Ё)
0x0451, #russian ye (ё)
)) === true',

        'self::chunk_split("абвг", 2) === "аб\r\nвг"',
        'self::chunk_split("абвг", 2, "|") === "аб|вг"',

        # lowercase() / uppercase(): for arrays the keys stay as they are, only the values are converted
        'self::lowercase("1234-ABCD-АБВГ") === "1234-abcd-абвг"',
        'self::lowercase(array("1234-ABCD-АБВГ" => "1234-ABCD-АБВГ")) === array("1234-ABCD-АБВГ" => "1234-abcd-абвг")',
        'self::uppercase("1234-abcd-абвг") === "1234-ABCD-АБВГ"',
        'self::uppercase(array("1234-abcd-абвг" => "1234-abcd-абвг")) === array("1234-abcd-абвг" => "1234-ABCD-АБВГ")',

        # round trip through a single-byte charset
        'self::convert_from(self::convert_to("123-ABC-abc-АБВ-абв", $charset = "cp1251"), $charset = "cp1251") === "123-ABC-abc-АБВ-абв"',

        # removing and restoring of diacritical marks and soft hyphens
        'self::diactrical_remove("вдох\xc2\xadно\xc2\xadве\xcc\x81\xc2\xadние") === "вдох\xc2\xadно\xc2\xadве\xc2\xadние"',
        'self::diactrical_remove("вдох\xc2\xadно\xc2\xadве\xcc\x81\xc2\xadние", array("\xc2\xad")) === "вдохновение"',
        'self::diactrical_remove("вдох\xc2\xadно\xc2\xadве\xcc\x81\xc2\xadние", array("\xc2\xad"), true, $restore_table) === "вдохновение"',
        'self::diactrical_restore("вдохновение", $restore_table) === "вдох\xc2\xadно\xc2\xadве\xcc\x81\xc2\xadние"',

        # this source file itself is used as a fixture: UTF-8, not pure ASCII, without binary data
        'self::is_utf8(file_get_contents(' . var_export(__FILE__, true) . ', true)) === true',
        'self::is_utf8(file_get_contents(' . var_export(__FILE__, true) . ', false)) === true',
        'self::is_ascii(file_get_contents(' . var_export(__FILE__, true) . ')) === false',
        'self::is_ascii("_\x01\x02абв", $error_char_offset) === false && $error_char_offset === 3',
        'self::has_binary(file_get_contents(' . var_export(__FILE__, true) . ')) === false',
        'self::has_binary("_аб\x01вг", $found_char_offset) === true && $found_char_offset === 3',

        #range() uses ord() and chr()
        'self::range("A", "D") === array("A", "B", "C", "D")',
        'self::range("а", "г") === array("а", "б", "в", "г")',
        'self::range(1, 3) === array(1, 2, 3)',

        # code point conversion round trips and searching
        '"↔" === self::chr(self::ord("↔"))',
        '"123-ABC-abc-АБВ-абв" === self::from_unicode(self::to_unicode("123-ABC-abc-АБВ-абв"))',
        'self::strpos("123-ABC-abc-абв-АБВ-где", "АБВ") === 16',
        'self::stripos("123-ABC-abc-абд-АБВ-где", "абв") === 16',
        'self::strpos("123-ABC-abc", "АБВ") === false',
        'self::strpos("123-АБВ-абв", "abc") === false',

        'self::preg_quote_case_insensitive("123_слово_test") === "123_(с|С)(л|Л)(о|О)(в|В)(о|О)_[tT][eE][sS][tT]"',
        'self::preg_quote_case_insensitive("123_test") === "(?i:123_test)"',
        'self::preg_quote_case_insensitive("123") === "123"',

        # all the supported escape formats give the same UTF-8 bytes
        'self::unescape("%D1%82%D0%B5%D1%81%D1%82") === "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"',
        'self::unescape("0xD182D0B5D181D182", true) === "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"',
        'self::unescape("%u0442%u0435%u0441%u0442") === "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"',
        'self::unescape("%u{442}%u{435}%u{0441}%u{00442}") === "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"',
        'self::unescape("%u0025%u0032%u0035+%25%75%30%30%32%35") === "%25 %u0025"',

        'self::ucfirst("!@#$", true) === "!@#$"',
        'self::ucfirst("!@#$ test", true) === "!@#$ test"',
        'self::ucfirst("«северный Поток»", true) === "«Северный поток»"',
        'self::ucfirst("«северный Поток»", false) === "«Северный Поток»"',

        //'self::strlen(file_get_contents(' . var_export(__FILE__, true) . ', true))'
    );
    # evaluate the assertions one by one and stop at the first one that does not hold
    foreach ($a as $k => $v) if (! assert($v)) return false;

    # disabled ad hoc benchmark
    //$start_time = microtime(true);
    //$s = file_get_contents(__FILE__);
    //for ($i = 0; $i < 10; $i++) $r = self::html_entity_encode($s);
    //$time = microtime(true) - $start_time;
    //d($time, $r);

    return true;
}
|
||
|
||
}
|