= 5.3.x * * In Russian: * * Поддержка UTF-8 в PHP 5. * * Возможности и преимущества * * Совместимость с интерфейсом стандартных PHP функций, работающих с однобайтовыми кодировками * * Возможность работы без PHP расширений ICONV и MBSTRING, если они есть, то активно используются! * Используется наиболее быстрый из доступных методов между MBSTRING, ICONV, родной реализацией на PHP и хаками. * * Полезные функции, отсутствующие в ICONV и MBSTRING * * Методы, которые принимают и возвращают строку, умеют принимать и возвращать null. * Это удобно при выборках значений из базы данных. * * Несколько методов умеют обрабатывать массивы рекурсивно: * array_change_key_case(), convert_from(), convert_to(), strict(), is_utf8(), blocks_check(), convert_case(), lowercase(), uppercase(), unescape() * * Проверка у методов входных параметров на допустимые типы через рефлексию (можно отключить) * * Единый интерфейс и инкапсуляция, можно унаследоваться и переопределить методы * * Покрытие тестами * * PHP >= 5.3.x * * Example: * $s = 'Hello, Привет'; * if (UTF8::is_utf8($s)) echo UTF8::strlen($s); * * UTF-8 encoding scheme: * 2^7 0x00000000 — 0x0000007F 0xxxxxxx * 2^11 0x00000080 — 0x000007FF 110xxxxx 10xxxxxx * 2^16 0x00000800 — 0x0000FFFF 1110xxxx 10xxxxxx 10xxxxxx * 2^21 0x00010000 — 0x001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx * 1-4 bytes length: 2^7 + 2^11 + 2^16 + 2^21 = 2 164 864 * * If I was a owner of the world, I would leave only 2 encoding: UTF-8 and UTF-32 ;-) * * Useful links * http://ru.wikipedia.org/wiki/UTF8 * http://www.madore.org/~david/misc/unitest/ A Unicode Test Page * http://www.unicode.org/ * http://www.unicode.org/reports/ * http://www.unicode.org/reports/tr10/ Unicode Collation Algorithm * http://www.unicode.org/Public/UCA/6.0.0/ Unicode Collation Algorithm * http://www.unicode.org/reports/tr6/ A Standard Compression Scheme for Unicode * http://www.fileformat.info/info/unicode/char/search.htm Unicode Character Search * * @link 
http://code.google.com/p/php5-utf8/ * @license http://creativecommons.org/licenses/by-sa/3.0/ * @author Nasibullin Rinat * @version 2.3.1 */ class UTF8 { /** * REPLACEMENT CHARACTER (for broken char) * * @var string */ const REPLACEMENT_CHAR = "\xEF\xBF\xBD"; #U+FFFD /** * Byte order mark, http://en.wikipedia.org/wiki/Byte_Order_Mark * * @var string */ const BOM = "\xEF\xBB\xBF"; /** * Regular expression for a character in UTF-8. * For engines, which don't support UTF8 mode. * In PCRE use a dot (".") and the flag /u, it works much faster! * * @var string */ const CHAR_RE = '[\x09\x0A\x0D\x20-\x7E] # ASCII strict # [\x00-\x7F] # ASCII non-strict (including control chars) | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 '; /** * Combining diactrical marks (Unicode 5.1). * \p{M} in PCRE terms. * For engines, which don't support UTF8 mode. * * For example, russian letters in composed form: "Ё" (U+0401), "Й" (U+0419), * decomposed form: (U+0415 U+0308), (U+0418 U+0306) * * @link http://www.unicode.org/charts/PDF/U0300.pdf * @link http://www.unicode.org/charts/PDF/U1DC0.pdf * @link http://www.unicode.org/charts/PDF/UFE20.pdf * @var string */ const DIACTRICAL_RE = ' \xcc[\x80-\xb9]|\xcd[\x80-\xaf] #UNICODE range: U+0300 — U+036F (for letters) | \xe2\x83[\x90-\xbf] #UNICODE range: U+20D0 — U+20FF (for symbols) | \xe1\xb7[\x80-\xbf] #UNICODE range: U+1DC0 — U+1DFF (supplement) | \xef\xb8[\xa0-\xaf] #UNICODE range: U+FE20 — U+FE2F (combining half marks) '; /** * \p{Lu} in PCRE terms. * For engines, which don't support UTF8 mode. 
* * @var string */ const CHAR_UPPER_RE = '[\x41-\x5a] | \xc3[\x80-\x9e] | \xc4[\x80-\xbf] | \xc5[\x81-\xbd] | \xc6[\x81-\xbc] | \xc7[\x85-\xbe] | \xc8[\x80-\xb2] | \xce[\x86-\xab] | \xcf[\x98-\xae] | \xd0[\x80-\xaf] | \xd1[\xa0-\xbe] | \xd2[\x80-\xbe] | \xd3[\x81-\xb8] | \xd4[\x80-\xbf] | \xd5[\x80-\x96] | \xe1[\xb8\xb9\xba][\x80-\xbe] | \xe1\xbb[\x80-\xb8] | \xe1\xbc[\x88-\xbf] | \xe1\xbd[\x88-\xaf] | \xe1[\xbe\xbf][\x88-\xbc] | \xef\xbc[\xa1-\xba] '; /** * \p{Ll} in PCRE terms. * For engines, which don't support UTF8 mode. * * @var string */ const CHAR_LOWER_RE = '[\x61-\x7a] | \xc2\xb5 | \xc3[\xa0-\xbf] | \xc4[\x81-\xbe] | \xc5[\x80-\xbe] | \xc6[\x83-\xbf] | \xc7[\x86-\xbf] | \xc8[\x81-\xb3] | \xc9[\x93-\xb5] | \xca[\x80-\x92] | \xce[\xac-\xbf] | \xcf[\x80-\xaf] | \xd0[\xb0-\xbf] | \xd1[\x80-\xbf] | \xd2[\x81-\xbf] | \xd3[\x82-\xb9] | \xd4[\x81-\x8f] | \xd5[\xa1-\xbf] | \xd6[\x80-\x86] | \xe1[\xb8\xb9\xba][\x81-\xbf] | \xe1\xbb[\x81-\xb9] | \xe1\xbc[\x80-\xb7] | \xe1\xbd[\x80-\xbd] | \xe1\xbe[\x80-\xb3] | \xe1\xbf[\x83-\xb3] | \xef\xbd[\x81-\x9a] '; /** * HTML entities, examples: > Ö ˜ " * * @var string */ const HTML_ENTITY_RE = '&(?> [a-zA-Z][a-zA-Z\d]++ | \#(?> \d{1,4}+ | x[\da-fA-F]{2,4}+ ) ); '; /** * Quotation marks. * For engines, which don't support UTF8 mode. * * @var string */ const QUOTATION_MARK_RE = '\x22|\xc2[\xab\xbb]|\xe2\x80[\x98\x99\x9a\x9c\x9d\x9e\xb9\xba]'; /** * * @var array */ public static $html_quotation_mark_table = array( '"' => "\x22", #U+0022 ["] " quotation mark = APL quote '«' => "\xc2\xab", #U+00AB [«] left-pointing double angle quotation mark = left pointing guillemet '»' => "\xc2\xbb", #U+00BB [»] right-pointing double angle quotation mark = right pointing guillemet ''' => "\xe2\x80\x98", #U+2018 ['] left single quotation mark ''' => "\xe2\x80\x99", #U+2019 ['] right single quotation mark (and apostrophe!) 
'Latin Extended-B' => array( 0 => 0x0180, 1 => 0x024F, 2 => 3, ), 'IPA Extensions' => array( 0 => 0x0250, 1 => 0x02AF, 2 => 4, ), 'Spacing Modifier Letters' => array( 0 => 0x02B0, 1 => 0x02FF, 2 => 5, ), 'Combining Diacritical Marks' => array( 0 => 0x0300, 1 => 0x036F, 2 => 6, ), 'Greek and Coptic' => array( 0 => 0x0370, 1 => 0x03FF, 2 => 7, ), 'Cyrillic' => array( 0 => 0x0400, 1 => 0x04FF, 2 => 8, ), 'Cyrillic Supplement' => array( 0 => 0x0500, 1 => 0x052F, 2 => 9, ), 'Armenian' => array( 0 => 0x0530, 1 => 0x058F, 2 => 10, ), 'Hebrew' => array( 0 => 0x0590, 1 => 0x05FF, 2 => 11, ), 'Arabic' => array( 0 => 0x0600, 1 => 0x06FF, 2 => 12, ), 'Syriac' => array( 0 => 0x0700, 1 => 0x074F, 2 => 13, ), 'Arabic Supplement' => array( 0 => 0x0750, 1 => 0x077F, 2 => 14, ), 'Thaana' => array( 0 => 0x0780, 1 => 0x07BF, 2 => 15, ), 'NKo' => array( 0 => 0x07C0, 1 => 0x07FF, 2 => 16, ), 'Samaritan' => array( 0 => 0x0800, 1 => 0x083F, 2 => 17, ), 'Mandaic' => array( 0 => 0x0840, 1 => 0x085F, 2 => 18, ), 'Devanagari' => array( 0 => 0x0900, 1 => 0x097F, 2 => 19, ), 'Bengali' => array( 0 => 0x0980, 1 => 0x09FF, 2 => 20, ), 'Gurmukhi' => array( 0 => 0x0A00, 1 => 0x0A7F, 2 => 21, ), 'Gujarati' => array( 0 => 0x0A80, 1 => 0x0AFF, 2 => 22, ), 'Oriya' => array( 0 => 0x0B00, 1 => 0x0B7F, 2 => 23, ), 'Tamil' => array( 0 => 0x0B80, 1 => 0x0BFF, 2 => 24, ), 'Telugu' => array( 0 => 0x0C00, 1 => 0x0C7F, 2 => 25, ), 'Kannada' => array( 0 => 0x0C80, 1 => 0x0CFF, 2 => 26, ), 'Malayalam' => array( 0 => 0x0D00, 1 => 0x0D7F, 2 => 27, ), 'Sinhala' => array( 0 => 0x0D80, 1 => 0x0DFF, 2 => 28, ), 'Thai' => array( 0 => 0x0E00, 1 => 0x0E7F, 2 => 29, ), 'Lao' => array( 0 => 0x0E80, 1 => 0x0EFF, 2 => 30, ), 'Tibetan' => array( 0 => 0x0F00, 1 => 0x0FFF, 2 => 31, ), 'Myanmar' => array( 0 => 0x1000, 1 => 0x109F, 2 => 32, ), 'Georgian' => array( 0 => 0x10A0, 1 => 0x10FF, 2 => 33, ), 'Hangul Jamo' => array( 0 => 0x1100, 1 => 0x11FF, 2 => 34, ), 'Ethiopic' => array( 0 => 0x1200, 1 => 0x137F, 2 => 35, ), 'Ethiopic 
Supplement' => array( 0 => 0x1380, 1 => 0x139F, 2 => 36, ), 'Cherokee' => array( 0 => 0x13A0, 1 => 0x13FF, 2 => 37, ), 'Unified Canadian Aboriginal Syllabics' => array( 0 => 0x1400, 1 => 0x167F, 2 => 38, ), 'Ogham' => array( 0 => 0x1680, 1 => 0x169F, 2 => 39, ), 'Runic' => array( 0 => 0x16A0, 1 => 0x16FF, 2 => 40, ), 'Tagalog' => array( 0 => 0x1700, 1 => 0x171F, 2 => 41, ), 'Hanunoo' => array( 0 => 0x1720, 1 => 0x173F, 2 => 42, ), 'Buhid' => array( 0 => 0x1740, 1 => 0x175F, 2 => 43, ), 'Tagbanwa' => array( 0 => 0x1760, 1 => 0x177F, 2 => 44, ), 'Khmer' => array( 0 => 0x1780, 1 => 0x17FF, 2 => 45, ), 'Mongolian' => array( 0 => 0x1800, 1 => 0x18AF, 2 => 46, ), 'Unified Canadian Aboriginal Syllabics Extended' => array( 0 => 0x18B0, 1 => 0x18FF, 2 => 47, ), 'Limbu' => array( 0 => 0x1900, 1 => 0x194F, 2 => 48, ), 'Tai Le' => array( 0 => 0x1950, 1 => 0x197F, 2 => 49, ), 'New Tai Lue' => array( 0 => 0x1980, 1 => 0x19DF, 2 => 50, ), 'Khmer Symbols' => array( 0 => 0x19E0, 1 => 0x19FF, 2 => 51, ), 'Buginese' => array( 0 => 0x1A00, 1 => 0x1A1F, 2 => 52, ), 'Tai Tham' => array( 0 => 0x1A20, 1 => 0x1AAF, 2 => 53, ), 'Balinese' => array( 0 => 0x1B00, 1 => 0x1B7F, 2 => 54, ), 'Sundanese' => array( 0 => 0x1B80, 1 => 0x1BBF, 2 => 55, ), 'Batak' => array( 0 => 0x1BC0, 1 => 0x1BFF, 2 => 56, ), 'Lepcha' => array( 0 => 0x1C00, 1 => 0x1C4F, 2 => 57, ), 'Ol Chiki' => array( 0 => 0x1C50, 1 => 0x1C7F, 2 => 58, ), 'Vedic Extensions' => array( 0 => 0x1CD0, 1 => 0x1CFF, 2 => 59, ), 'Phonetic Extensions' => array( 0 => 0x1D00, 1 => 0x1D7F, 2 => 60, ), 'Phonetic Extensions Supplement' => array( 0 => 0x1D80, 1 => 0x1DBF, 2 => 61, ), 'Combining Diacritical Marks Supplement' => array( 0 => 0x1DC0, 1 => 0x1DFF, 2 => 62, ), 'Latin Extended Additional' => array( 0 => 0x1E00, 1 => 0x1EFF, 2 => 63, ), 'Greek Extended' => array( 0 => 0x1F00, 1 => 0x1FFF, 2 => 64, ), 'General Punctuation' => array( 0 => 0x2000, 1 => 0x206F, 2 => 65, ), 'Superscripts and Subscripts' => array( 0 => 0x2070, 1 => 0x209F, 2 => 
66, ), 'Currency Symbols' => array( 0 => 0x20A0, 1 => 0x20CF, 2 => 67, ), 'Combining Diacritical Marks for Symbols' => array( 0 => 0x20D0, 1 => 0x20FF, 2 => 68, ), 'Letterlike Symbols' => array( 0 => 0x2100, 1 => 0x214F, 2 => 69, ), 'Number Forms' => array( 0 => 0x2150, 1 => 0x218F, 2 => 70, ), 'Arrows' => array( 0 => 0x2190, 1 => 0x21FF, 2 => 71, ), 'Mathematical Operators' => array( 0 => 0x2200, 1 => 0x22FF, 2 => 72, ), 'Miscellaneous Technical' => array( 0 => 0x2300, 1 => 0x23FF, 2 => 73, ), 'Control Pictures' => array( 0 => 0x2400, 1 => 0x243F, 2 => 74, ), 'Optical Character Recognition' => array( 0 => 0x2440, 1 => 0x245F, 2 => 75, ), 'Enclosed Alphanumerics' => array( 0 => 0x2460, 1 => 0x24FF, 2 => 76, ), 'Box Drawing' => array( 0 => 0x2500, 1 => 0x257F, 2 => 77, ), 'Block Elements' => array( 0 => 0x2580, 1 => 0x259F, 2 => 78, ), 'Geometric Shapes' => array( 0 => 0x25A0, 1 => 0x25FF, 2 => 79, ), 'Miscellaneous Symbols' => array( 0 => 0x2600, 1 => 0x26FF, 2 => 80, ), 'Dingbats' => array( 0 => 0x2700, 1 => 0x27BF, 2 => 81, ), 'Miscellaneous Mathematical Symbols-A' => array( 0 => 0x27C0, 1 => 0x27EF, 2 => 82, ), 'Supplemental Arrows-A' => array( 0 => 0x27F0, 1 => 0x27FF, 2 => 83, ), 'Braille Patterns' => array( 0 => 0x2800, 1 => 0x28FF, 2 => 84, ), 'Supplemental Arrows-B' => array( 0 => 0x2900, 1 => 0x297F, 2 => 85, ), 'Miscellaneous Mathematical Symbols-B' => array( 0 => 0x2980, 1 => 0x29FF, 2 => 86, ), 'Supplemental Mathematical Operators' => array( 0 => 0x2A00, 1 => 0x2AFF, 2 => 87, ), 'Miscellaneous Symbols and Arrows' => array( 0 => 0x2B00, 1 => 0x2BFF, 2 => 88, ), 'Glagolitic' => array( 0 => 0x2C00, 1 => 0x2C5F, 2 => 89, ), 'Latin Extended-C' => array( 0 => 0x2C60, 1 => 0x2C7F, 2 => 90, ), 'Coptic' => array( 0 => 0x2C80, 1 => 0x2CFF, 2 => 91, ), 'Georgian Supplement' => array( 0 => 0x2D00, 1 => 0x2D2F, 2 => 92, ), 'Tifinagh' => array( 0 => 0x2D30, 1 => 0x2D7F, 2 => 93, ), 'Ethiopic Extended' => array( 0 => 0x2D80, 1 => 0x2DDF, 2 => 94, ), 'Cyrillic 
Extended-A' => array( 0 => 0x2DE0, 1 => 0x2DFF, 2 => 95, ), 'Supplemental Punctuation' => array( 0 => 0x2E00, 1 => 0x2E7F, 2 => 96, ), 'CJK Radicals Supplement' => array( 0 => 0x2E80, 1 => 0x2EFF, 2 => 97, ), 'Kangxi Radicals' => array( 0 => 0x2F00, 1 => 0x2FDF, 2 => 98, ), 'Ideographic Description Characters' => array( 0 => 0x2FF0, 1 => 0x2FFF, 2 => 99, ), 'CJK Symbols and Punctuation' => array( 0 => 0x3000, 1 => 0x303F, 2 => 100, ), 'Hiragana' => array( 0 => 0x3040, 1 => 0x309F, 2 => 101, ), 'Katakana' => array( 0 => 0x30A0, 1 => 0x30FF, 2 => 102, ), 'Bopomofo' => array( 0 => 0x3100, 1 => 0x312F, 2 => 103, ), 'Hangul Compatibility Jamo' => array( 0 => 0x3130, 1 => 0x318F, 2 => 104, ), 'Kanbun' => array( 0 => 0x3190, 1 => 0x319F, 2 => 105, ), 'Bopomofo Extended' => array( 0 => 0x31A0, 1 => 0x31BF, 2 => 106, ), 'CJK Strokes' => array( 0 => 0x31C0, 1 => 0x31EF, 2 => 107, ), 'Katakana Phonetic Extensions' => array( 0 => 0x31F0, 1 => 0x31FF, 2 => 108, ), 'Enclosed CJK Letters and Months' => array( 0 => 0x3200, 1 => 0x32FF, 2 => 109, ), 'CJK Compatibility' => array( 0 => 0x3300, 1 => 0x33FF, 2 => 110, ), 'CJK Unified Ideographs Extension A' => array( 0 => 0x3400, 1 => 0x4DBF, 2 => 111, ), 'Yijing Hexagram Symbols' => array( 0 => 0x4DC0, 1 => 0x4DFF, 2 => 112, ), 'CJK Unified Ideographs' => array( 0 => 0x4E00, 1 => 0x9FFF, 2 => 113, ), 'Yi Syllables' => array( 0 => 0xA000, 1 => 0xA48F, 2 => 114, ), 'Yi Radicals' => array( 0 => 0xA490, 1 => 0xA4CF, 2 => 115, ), 'Lisu' => array( 0 => 0xA4D0, 1 => 0xA4FF, 2 => 116, ), 'Vai' => array( 0 => 0xA500, 1 => 0xA63F, 2 => 117, ), 'Cyrillic Extended-B' => array( 0 => 0xA640, 1 => 0xA69F, 2 => 118, ), 'Bamum' => array( 0 => 0xA6A0, 1 => 0xA6FF, 2 => 119, ), 'Modifier Tone Letters' => array( 0 => 0xA700, 1 => 0xA71F, 2 => 120, ), 'Latin Extended-D' => array( 0 => 0xA720, 1 => 0xA7FF, 2 => 121, ), 'Syloti Nagri' => array( 0 => 0xA800, 1 => 0xA82F, 2 => 122, ), 'Common Indic Number Forms' => array( 0 => 0xA830, 1 => 0xA83F, 2 => 123, 
), 'Phags-pa' => array( 0 => 0xA840, 1 => 0xA87F, 2 => 124, ), 'Saurashtra' => array( 0 => 0xA880, 1 => 0xA8DF, 2 => 125, ), 'Devanagari Extended' => array( 0 => 0xA8E0, 1 => 0xA8FF, 2 => 126, ), 'Kayah Li' => array( 0 => 0xA900, 1 => 0xA92F, 2 => 127, ), 'Rejang' => array( 0 => 0xA930, 1 => 0xA95F, 2 => 128, ), 'Hangul Jamo Extended-A' => array( 0 => 0xA960, 1 => 0xA97F, 2 => 129, ), 'Javanese' => array( 0 => 0xA980, 1 => 0xA9DF, 2 => 130, ), 'Cham' => array( 0 => 0xAA00, 1 => 0xAA5F, 2 => 131, ), 'Myanmar Extended-A' => array( 0 => 0xAA60, 1 => 0xAA7F, 2 => 132, ), 'Tai Viet' => array( 0 => 0xAA80, 1 => 0xAADF, 2 => 133, ), 'Ethiopic Extended-A' => array( 0 => 0xAB00, 1 => 0xAB2F, 2 => 134, ), 'Meetei Mayek' => array( 0 => 0xABC0, 1 => 0xABFF, 2 => 135, ), 'Hangul Syllables' => array( 0 => 0xAC00, 1 => 0xD7AF, 2 => 136, ), 'Hangul Jamo Extended-B' => array( 0 => 0xD7B0, 1 => 0xD7FF, 2 => 137, ), 'High Surrogates' => array( 0 => 0xD800, 1 => 0xDB7F, 2 => 138, ), 'High Private Use Surrogates' => array( 0 => 0xDB80, 1 => 0xDBFF, 2 => 139, ), 'Low Surrogates' => array( 0 => 0xDC00, 1 => 0xDFFF, 2 => 140, ), 'Private Use Area' => array( 0 => 0xE000, 1 => 0xF8FF, 2 => 141, ), 'CJK Compatibility Ideographs' => array( 0 => 0xF900, 1 => 0xFAFF, 2 => 142, ), 'Alphabetic Presentation Forms' => array( 0 => 0xFB00, 1 => 0xFB4F, 2 => 143, ), 'Arabic Presentation Forms-A' => array( 0 => 0xFB50, 1 => 0xFDFF, 2 => 144, ), 'Variation Selectors' => array( 0 => 0xFE00, 1 => 0xFE0F, 2 => 145, ), 'Vertical Forms' => array( 0 => 0xFE10, 1 => 0xFE1F, 2 => 146, ), 'Combining Half Marks' => array( 0 => 0xFE20, 1 => 0xFE2F, 2 => 147, ), 'CJK Compatibility Forms' => array( 0 => 0xFE30, 1 => 0xFE4F, 2 => 148, ), 'Small Form Variants' => array( 0 => 0xFE50, 1 => 0xFE6F, 2 => 149, ), 'Arabic Presentation Forms-B' => array( 0 => 0xFE70, 1 => 0xFEFF, 2 => 150, ), 'Halfwidth and Fullwidth Forms' => array( 0 => 0xFF00, 1 => 0xFFEF, 2 => 151, ), 'Specials' => array( 0 => 0xFFF0, 1 => 0xFFFF, 2 => 
152, ), 'Linear B Syllabary' => array( 0 => 0x10000, 1 => 0x1007F, 2 => 153, ), 'Linear B Ideograms' => array( 0 => 0x10080, 1 => 0x100FF, 2 => 154, ), 'Aegean Numbers' => array( 0 => 0x10100, 1 => 0x1013F, 2 => 155, ), 'Ancient Greek Numbers' => array( 0 => 0x10140, 1 => 0x1018F, 2 => 156, ), 'Ancient Symbols' => array( 0 => 0x10190, 1 => 0x101CF, 2 => 157, ), 'Phaistos Disc' => array( 0 => 0x101D0, 1 => 0x101FF, 2 => 158, ), 'Lycian' => array( 0 => 0x10280, 1 => 0x1029F, 2 => 159, ), 'Carian' => array( 0 => 0x102A0, 1 => 0x102DF, 2 => 160, ), 'Old Italic' => array( 0 => 0x10300, 1 => 0x1032F, 2 => 161, ), 'Gothic' => array( 0 => 0x10330, 1 => 0x1034F, 2 => 162, ), 'Ugaritic' => array( 0 => 0x10380, 1 => 0x1039F, 2 => 163, ), 'Old Persian' => array( 0 => 0x103A0, 1 => 0x103DF, 2 => 164, ), 'Deseret' => array( 0 => 0x10400, 1 => 0x1044F, 2 => 165, ), 'Shavian' => array( 0 => 0x10450, 1 => 0x1047F, 2 => 166, ), 'Osmanya' => array( 0 => 0x10480, 1 => 0x104AF, 2 => 167, ), 'Cypriot Syllabary' => array( 0 => 0x10800, 1 => 0x1083F, 2 => 168, ), 'Imperial Aramaic' => array( 0 => 0x10840, 1 => 0x1085F, 2 => 169, ), 'Phoenician' => array( 0 => 0x10900, 1 => 0x1091F, 2 => 170, ), 'Lydian' => array( 0 => 0x10920, 1 => 0x1093F, 2 => 171, ), 'Kharoshthi' => array( 0 => 0x10A00, 1 => 0x10A5F, 2 => 172, ), 'Old South Arabian' => array( 0 => 0x10A60, 1 => 0x10A7F, 2 => 173, ), 'Avestan' => array( 0 => 0x10B00, 1 => 0x10B3F, 2 => 174, ), 'Inscriptional Parthian' => array( 0 => 0x10B40, 1 => 0x10B5F, 2 => 175, ), 'Inscriptional Pahlavi' => array( 0 => 0x10B60, 1 => 0x10B7F, 2 => 176, ), 'Old Turkic' => array( 0 => 0x10C00, 1 => 0x10C4F, 2 => 177, ), 'Rumi Numeral Symbols' => array( 0 => 0x10E60, 1 => 0x10E7F, 2 => 178, ), 'Brahmi' => array( 0 => 0x11000, 1 => 0x1107F, 2 => 179, ), 'Kaithi' => array( 0 => 0x11080, 1 => 0x110CF, 2 => 180, ), 'Cuneiform' => array( 0 => 0x12000, 1 => 0x123FF, 2 => 181, ), 'Cuneiform Numbers and Punctuation' => array( 0 => 0x12400, 1 => 0x1247F, 2 => 
182, ), 'Egyptian Hieroglyphs' => array( 0 => 0x13000, 1 => 0x1342F, 2 => 183, ), 'Bamum Supplement' => array( 0 => 0x16800, 1 => 0x16A3F, 2 => 184, ), 'Kana Supplement' => array( 0 => 0x1B000, 1 => 0x1B0FF, 2 => 185, ), 'Byzantine Musical Symbols' => array( 0 => 0x1D000, 1 => 0x1D0FF, 2 => 186, ), 'Musical Symbols' => array( 0 => 0x1D100, 1 => 0x1D1FF, 2 => 187, ), 'Ancient Greek Musical Notation' => array( 0 => 0x1D200, 1 => 0x1D24F, 2 => 188, ), 'Tai Xuan Jing Symbols' => array( 0 => 0x1D300, 1 => 0x1D35F, 2 => 189, ), 'Counting Rod Numerals' => array( 0 => 0x1D360, 1 => 0x1D37F, 2 => 190, ), 'Mathematical Alphanumeric Symbols' => array( 0 => 0x1D400, 1 => 0x1D7FF, 2 => 191, ), 'Mahjong Tiles' => array( 0 => 0x1F000, 1 => 0x1F02F, 2 => 192, ), 'Domino Tiles' => array( 0 => 0x1F030, 1 => 0x1F09F, 2 => 193, ), 'Playing Cards' => array( 0 => 0x1F0A0, 1 => 0x1F0FF, 2 => 194, ), 'Enclosed Alphanumeric Supplement' => array( 0 => 0x1F100, 1 => 0x1F1FF, 2 => 195, ), 'Enclosed Ideographic Supplement' => array( 0 => 0x1F200, 1 => 0x1F2FF, 2 => 196, ), 'Miscellaneous Symbols And Pictographs' => array( 0 => 0x1F300, 1 => 0x1F5FF, 2 => 197, ), 'Emoticons' => array( 0 => 0x1F600, 1 => 0x1F64F, 2 => 198, ), 'Transport And Map Symbols' => array( 0 => 0x1F680, 1 => 0x1F6FF, 2 => 199, ), 'Alchemical Symbols' => array( 0 => 0x1F700, 1 => 0x1F77F, 2 => 200, ), 'CJK Unified Ideographs Extension B' => array( 0 => 0x20000, 1 => 0x2A6DF, 2 => 201, ), 'CJK Unified Ideographs Extension C' => array( 0 => 0x2A700, 1 => 0x2B73F, 2 => 202, ), 'CJK Unified Ideographs Extension D' => array( 0 => 0x2B740, 1 => 0x2B81F, 2 => 203, ), 'CJK Compatibility Ideographs Supplement' => array( 0 => 0x2F800, 1 => 0x2FA1F, 2 => 204, ), 'Tags' => array( 0 => 0xE0000, 1 => 0xE007F, 2 => 205, ), 'Variation Selectors Supplement' => array( 0 => 0xE0100, 1 => 0xE01EF, 2 => 206, ), 'Supplementary Private Use Area-A' => array( 0 => 0xF0000, 1 => 0xFFFFF, 2 => 207, ), 'Supplementary Private Use Area-B' => array( 0 
=> 0x100000, 1 => 0x10FFFF, 2 => 208, ), );

    #This class is used only through its static methods; the private constructor forbids instantiation.
    private function __construct() {}

    /**
     * Removes combining diacritical marks from a string, optionally recording
     * what was removed so that self::diactrical_restore() can put it back.
     *
     * Non-string and empty-string input is returned unchanged.
     *
     * @see self::diactrical_restore()
     * @param string|null $s
     * @param array|null  $additional_chars  Extra sequences to strip together with the marks,
     *                                       for example: "\xc2\xad" #soft hyphen = discretionary hyphen
     * @param bool        $is_can_restored   Fill $restore_table while removing?
     * @param array|null  &$restore_table    Receives array('offsets' => array(<char position> => <removed sequence>),
     *                                       'length' => <char length of the result>); stays an empty array
     *                                       when nothing was removed
     * @return string|bool|null  Returns FALSE if error occurred
     */
    public static function diactrical_remove($s, $additional_chars = null, $is_can_restored = false, &$restore_table = null)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! is_string($s) || $s === '') return $s;

        $marks_re = self::DIACTRICAL_RE;
        if ($additional_chars)
        {
            #quote into a fresh array: no by-reference iteration, no dangling reference left behind
            $quoted = array();
            foreach ($additional_chars as $char) $quoted[] = preg_quote($char, '/');
            $marks_re .= '|' . implode('|', $quoted);
        }
        $re = '/((?>' . $marks_re . ')+)/sxSX';

        if (! $is_can_restored) return preg_replace($re, '', $s);

        $restore_table = array();
        #with PREG_SPLIT_DELIM_CAPTURE the result alternates: text, removed marks, text, ...
        $parts = preg_split($re, $s, -1, PREG_SPLIT_DELIM_CAPTURE);
        $total = count($parts);
        if ($total === 1) return $s; #nothing was removed

        $pos = 0;
        $result = '';
        for ($i = 0; $i < $total - 1; $i += 2)
        {
            $result .= $parts[$i];
            #remember character (not byte!) positions
            $pos += self::strlen($parts[$i]);
            $restore_table['offsets'][$pos] = $parts[$i + 1];
        }
        $tail = end($parts);
        $restore_table['length'] = $pos + self::strlen($tail);
        return $result . $tail;
    }

    /**
     * Restores the combining diacritical marks removed by self::diactrical_remove().
     *
     * Works only if the character positions and the number of characters
     * of the string have not changed since the marks were removed.
     *
     * @see self::diactrical_remove()
     * @param string|null $s
     * @param array       $restore_table  The table filled by self::diactrical_remove()
     * @return string|bool|null  Returns FALSE if error occurred (broken $restore_table)
     */
    public static function diactrical_restore($s, array $restore_table)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! is_string($s) || $s === '') return $s;
        if (! $restore_table) return $s; #nothing was removed, nothing to restore

        #validate the table explicitly instead of silencing notices with "@"
        if (! isset($restore_table['length'], $restore_table['offsets'])) return false;
        if (! is_int($restore_table['length']) || ! is_array($restore_table['offsets'])) return false;
        if ($restore_table['length'] !== self::strlen($s)) return false;

        $offset = 0;
        $result = '';
        foreach ($restore_table['offsets'] as $pos => $diactricals)
        {
            #copy the text up to the recorded position, then re-insert the marks that followed it
            $result .= self::substr($s, $offset, $pos - $offset) . $diactricals;
            $offset = $pos;
        }
        #the byte length is only an upper bound here: it takes everything that is left
        return $result . self::substr($s, $offset, strlen($s));
    }

    /**
     * Encodes data from another character encoding to UTF-8.
     * The charset name is upper-cased before it is handed to self::_convert().
     *
     * @param array|scalar|null $data
     * @param string            $charset
     * @return array|scalar|null  Returns FALSE if error occurred
     */
    public static function convert_from($data, $charset = 'cp1251')
    {
        if (! ReflectionTypeHint::isValid()) return false;
        return self::_convert($data, strtoupper($charset), 'UTF-8');
    }

    /**
     * Encodes data from UTF-8 to another character encoding.
     * The charset name is upper-cased before it is handed to self::_convert().
     *
     * @param array|scalar|null $data
     * @param string            $charset
     * @return array|scalar|null  Returns FALSE if error occurred
     */
    public static function convert_to($data, $charset = 'cp1251')
    {
        if (! ReflectionTypeHint::isValid()) return false;
        return self::_convert($data, 'UTF-8', strtoupper($charset));
    }

    /**
     * Recoding the data of any structure to/from UTF-8.
     * Arrays traversed recursively, recoded keys and values.
     *
     * @see mb_encoding_aliases()
     * @param array|scalar|null $data
     * @param string $charset_from
     * @param string $charset_to
     * @return array|scalar|null Returns FALSE if error occurred
     */
    private static function _convert($data, $charset_from, $charset_to)
    {
        if (! ReflectionTypeHint::isValid()) return false; #for recursive calls
        if ($charset_from === $charset_to) return $data; #speed improve
        if (is_array($data))
        {
            $d = array();
            foreach ($data as $k => &$v)
            {
                if (is_string($k))
                {
                    $k = self::_convert($k, $charset_from, $charset_to);
                    if (!
is_string($k)) return false; #a string key must still be a string after recoding
                }
                $d[$k] = self::_convert($v, $charset_from, $charset_to);
                #FALSE signals a failed conversion, unless the source value itself was a boolean
                if ($d[$k] === false && ! is_bool($v)) return false;
            }
            return $d;
        }
        if (is_string($data))
        {
            #smart behaviour for errors protected + speed improve:
            #a source that is not valid UTF-8, or data that already is UTF-8, is returned untouched
            if ($charset_from === 'UTF-8' && ! self::is_utf8($data)) return $data;
            if ($charset_to === 'UTF-8' && self::is_utf8($data)) return $data;

            #since PHP-5.3.x iconv() faster then mb_convert_encoding()
            #NOTE(review): the documented flag spelling is "//TRANSLIT//IGNORE"; confirm this order behaves the same on the target iconv implementation
            if (function_exists('iconv')) return iconv($charset_from, $charset_to . '//IGNORE//TRANSLIT', $data);
            if (function_exists('mb_convert_encoding')) return mb_convert_encoding($data, $charset_to, $charset_from);

            #Everything below is reached only when neither iconv nor mbstring is available.
            #NOTE(review): utf8_encode()/utf8_decode() are deprecated as of PHP 8.2 and convert_cyr_string() was removed in PHP 8.0

            #charset_from: these branches do not look at $charset_to
            #(presumably it is always 'UTF-8' here - verify against the callers)
            if ($charset_from === 'ISO-8859-1') return utf8_encode($data);
            if ($charset_from === 'UTF-16' || $charset_from === 'UCS-2') return self::_convert_from_utf16($data);
            #self::$cp1259_table is defined outside of this view; presumably it maps cp1259 bytes to UTF-8 sequences
            if ($charset_from === 'CP1251' || $charset_from === 'CP1259') return strtr($data, self::$cp1259_table);
            #the other cyrillic charsets are first converted to windows-1251 ('w'), then mapped through the same table
            if ($charset_from === 'KOI8-R') return strtr(convert_cyr_string($data, 'k', 'w'), self::$cp1259_table);
            if ($charset_from === 'ISO-8859-5') return strtr(convert_cyr_string($data, 'i', 'w'), self::$cp1259_table);
            if ($charset_from === 'CP866') return strtr(convert_cyr_string($data, 'a', 'w'), self::$cp1259_table);
            if ($charset_from === 'MAC-CYRILLIC') return strtr(convert_cyr_string($data, 'm', 'w'), self::$cp1259_table);

            #charset_to
            if ($charset_to === 'ISO-8859-1') return utf8_decode($data);
            #the flipped table is used for the opposite direction
            if ($charset_to === 'CP1251' || $charset_to === 'CP1259') return strtr($data, array_flip(self::$cp1259_table));

            #last trying: the "recode" extension, errors are suppressed and a non-string result is ignored
            if (function_exists('recode_string'))
            {
                $s = @recode_string($charset_from . '..' . $charset_to, $data);
                if (is_string($s)) return $s;
            }

            #no conversion route was found
            trigger_error('Convert "' . $charset_from . '" --> "' . $charset_to .
'" is not supported native, "iconv" or "mbstring" extension required', E_USER_WARNING);
            return false;
        }
        if (is_scalar($data) || is_null($data)) return $data; #~ null, integer, float, boolean
        return false; #object or resource
    }

    /**
     * Convert UTF-16 / UCS-2 encoding string to UTF-8.
     * Surrogates UTF-16 are supported!
     *
     * When the iconv or mbstring extension is available it does the conversion.
     * Otherwise the string is decoded natively, 16 bits at a time. In the native
     * decoder every unpaired surrogate (a high one without a following low one,
     * a low one without a preceding high one, a high one at the very end of the
     * input) is replaced by U+FFFD and the surrounding code units are kept.
     *
     * @param string $s
     * @param string $type      'BE' -- big endian byte order
     *                          'LE' -- little endian byte order
     * @param bool   $to_array  returns array chars instead whole string?
     * @return string|array|bool  UTF-8 string, array chars or FALSE if error occurred
     */
    private static function _convert_from_utf16($s, $type = 'BE', $to_array = false)
    {
        static $types = array(
            'BE' => 'n', #unsigned short (always 16 bit, big endian byte order)
            'LE' => 'v', #unsigned short (always 16 bit, little endian byte order)
        );
        if (! array_key_exists($type, $types))
        {
            trigger_error('Unexpected value in 2-nd parameter, "' . $type . '" given!', E_USER_WARNING);
            return false;
        }

        #the fastest way:
        if (function_exists('iconv') || function_exists('mb_convert_encoding'))
        {
            $s = function_exists('iconv')
                ? iconv('UTF-16' . $type, 'UTF-8', $s)
                : mb_convert_encoding($s, 'UTF-8', 'UTF-16' . $type);
            return $to_array ? self::str_split($s) : $s;
        }

        /*
        http://en.wikipedia.org/wiki/UTF-16

        The improvement that UTF-16 made over UCS-2 is its ability to encode
        characters in planes 1-16, not just those in plane 0 (BMP).

        UTF-16 represents non-BMP characters (those from U+10000 through U+10FFFF)
        using a pair of 16-bit words, known as a surrogate pair.
        First 0x10000 is subtracted from the code point to give a 20-bit value.
        This is then split into two separate 10-bit values each of which is represented
        as a surrogate with the most significant half placed in the first surrogate.
        To allow safe use of simple word-oriented string processing, separate ranges
        of values are used for the two surrogates: 0xD800-0xDBFF for the first, most
        significant surrogate and 0xDC00-0xDFFF for the second, least significant surrogate.

        For example, the character at code point U+10000 becomes the code unit sequence 0xD800 0xDC00,
        and the character at U+10FFFD, the upper limit of Unicode, becomes the sequence 0xDBFF 0xDFFD.
        Unicode and ISO/IEC 10646 do not, and will never, assign characters to any of the code points
        in the U+D800-U+DFFF range, so an individual code value from a surrogate pair does not ever
        represent a character.

        http://www.russellcottrell.com/greek/utilities/SurrogatePairCalculator.htm
        http://www.russellcottrell.com/greek/utilities/UnicodeRanges.htm

        Conversion of a Unicode scalar value S to a surrogate pair :
          H = Math.floor((S - 0x10000) / 0x400) + 0xD800;
          L = ((S - 0x10000) % 0x400) + 0xDC00;
        The conversion of a surrogate pair  to a scalar value:
          N = ((H - 0xD800) * 0x400) + (L - 0xDC00) + 0x10000;
        */
        $a = array();
        $hi = false; #pending high (leading) surrogate, FALSE if there is none
        foreach (unpack($types[$type] . '*', $s) as $unit)
        {
            $is_low = $unit >= 0xDC00 && $unit <= 0xDFFF;
            if ($hi !== false)
            {
                if ($is_low)
                {
                    #a complete surrogate pair
                    $a[] = self::chr((($hi - 0xD800) * 0x400) + ($unit - 0xDC00) + 0x10000);
                    $hi = false;
                    continue;
                }
                #unpaired high surrogate: replace it, the current unit is then processed on its own below
                $a[] = self::REPLACEMENT_CHAR;
                $hi = false;
            }
            if ($unit >= 0xD800 && $unit <= 0xDBFF) $hi = $unit; #high surrogate was found, wait for its pair
            elseif ($is_low) $a[] = self::REPLACEMENT_CHAR; #low surrogate without a preceding high one
            else $a[] = self::chr($unit); #not surrogate
        }
        if ($hi !== false) $a[] = self::REPLACEMENT_CHAR; #the input ended right after a high surrogate
        return $to_array ? $a : implode('', $a);
    }

    /**
     * Strips out device control codes in the ASCII range.
     *
     * @param array|scalar|null Data to clean
     * @return array|scalar|null Returns FALSE if error occurred
     */
    public static function strict($data)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (is_array($data))
        {
            $d = array();
            foreach ($data as $k => &$v)
            {
                if (is_string($k))
                {
                    $k = self::strict($k);
                    if (!
is_string($k)) return false; } $d[$k] = self::strict($v); if ($d[$k] === false && ! is_bool($v)) return false; } return $d; } if (is_string($data)) return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F]+/sSX', '', $data); if (is_scalar($data) || is_null($data)) return $data; #int/float/bool/null return false; #object or resource } /** * Check the data accessory to the class of control characters in ASCII. * For non string always returns FALSE. * * @param scalar|null $data * @param int|null $found_char_offset Returns the offset for the first found binary symbol * @return bool */ public static function has_binary($data, &$found_char_offset = null) { if (! ReflectionTypeHint::isValid()) return false; #[\t\n\r] = [\x09\x0a\x0d] #[\x00-\x1f\x7f](? &$v) { if (! self::is_utf8($k, $is_strict) || ! self::is_utf8($v, $is_strict)) return false; } return true; } return false; #object or resource } /** * Tries to detect if a string is in Unicode encoding * * @deprecated Slowly, use self::is_utf8() instead * @see self::is_utf8() * @param string $s текст * @param bool $is_strict строгая проверка диапазона ASCII? * @return bool */ public static function check($s, $is_strict = true) { if (! ReflectionTypeHint::isValid()) return false; for ($i = 0, $len = strlen($s); $i < $len; $i++) { $c = ord($s[$i]); if ($c < 0x80) #1 byte 0bbbbbbb { if ($is_strict === false || ($c > 0x1F && $c < 0x7F) || $c == 0x09 || $c == 0x0A || $c == 0x0D) continue; } if (($c & 0xE0) == 0xC0) $n = 1; #2 bytes 110bbbbb 10bbbbbb elseif (($c & 0xF0) == 0xE0) $n = 2; #3 bytes 1110bbbb 10bbbbbb 10bbbbbb elseif (($c & 0xF8) == 0xF0) $n = 3; #4 bytes 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb elseif (($c & 0xFC) == 0xF8) $n = 4; #5 bytes 111110bb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb elseif (($c & 0xFE) == 0xFC) $n = 5; #6 bytes 1111110b 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb else return false; #does not match any model #n bytes matching 10bbbbbb follow ? 
for ($j = 0; $j < $n; $j++) { $i++; if ($i == $len || ((ord($s[$i]) & 0xC0) != 0x80) ) return false; } } return true; } /** * Check the data in UTF-8 charset on given ranges of the standard UNICODE. * The suitable alternative to regular expressions. * * For null, integer, float, boolean returns TRUE. * * Arrays traversed recursively (keys and values). * At least if one array element value is not passed checking, it returns FALSE. * * @example * #A simple check the standard named ranges: * UTF8::blocks_check('поисковые системы Google и Yandex', array('Basic Latin', 'Cyrillic')); * #You can check the named, direct ranges or codepoints together: * UTF8::blocks_check('поисковые системы Google и Yandex', array(array(0x20, 0x7E), #[\x20-\x7E] * array(0x0410, 0x044F), #[A-Яa-я] * 0x0401, #russian yo (Ё) * 0x0451, #russian ye (ё) * 'Arrows', * )); * * @link http://www.unicode.org/charts/ * @param array|scalar|null $data * @param array|string $blocks * @return bool Возвращает TRUE, если все символы из текста принадлежат указанным диапазонам * и FALSE в противном случае или для разбитого UTF-8. */ public static function blocks_check($data, $blocks) { if (! ReflectionTypeHint::isValid()) return false; if (is_array($data)) { foreach ($data as $k => &$v) { if (! self::blocks_check($k, $blocks) || ! self::blocks_check($v, $blocks)) return false; } return true; } if (is_int($data)) $data = strval($data); elseif (is_float($data)) $data = str_replace(',', '.', strval($data)); elseif (! is_string($data)) return false; $chars = self::str_split($data); if ($chars === false) return false; #broken UTF-8 unset($data); #memory free $skip = array(); #save to cache already checked symbols foreach ($chars as $i => $char) { if (array_key_exists($char, $skip)) continue; #speed improve $codepoint = self::ord($char); if (! is_int($codepoint)) return false; #broken UTF-8? $is_valid = false; $blocks = (array)$blocks; foreach ($blocks as $j => $block) { if (is_string($block)) { if (! 
array_key_exists($block, self::$unicode_blocks))
                    {
                        trigger_error('Unknown block "' . $block . '"!', E_USER_WARNING);
                        return false;
                    }
                    list ($min, $max) = self::$unicode_blocks[$block];
                }
                elseif (is_array($block)) list ($min, $max) = $block;
                elseif (is_int($block)) $min = $max = $block;
                else trigger_error('A string/array/int type expected for block[' . $j . ']!', E_USER_ERROR);
                if ($codepoint >= $min && $codepoint <= $max)
                {
                    $is_valid = true;
                    break;
                }
            }
            if (! $is_valid) return false;
            $skip[$char] = null;
        }
        return true;
    }

    /**
     * Compares two UTF-8 strings.
     *
     * With the intl extension available the comparison is locale aware (Collator),
     * otherwise it falls back to the byte-wise native strcmp().
     *
     * @param   string|null    $s1
     * @param   string|null    $s2
     * @param   string         $locale   For example, 'en_CA', 'ru_RU'.
     *                                   If an empty string ("") or "root" is passed, UCA rules are used.
     * @return  int|bool|null  Returns FALSE if error occurred
     *                         Returns NULL if one of the arguments is not a string
     *                         Returns < 0 if $s1 is less than $s2;
     *                                 > 0 if $s1 is greater than $s2;
     *                                 0 if they are equal.
     */
    public static function strcmp($s1, $s2, $locale = '')
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! is_string($s1) || ! is_string($s2)) return null;
        #no intl extension: byte-wise comparison is the best we can do
        if (! function_exists('collator_create')) return strcmp($s1, $s2);
        # PHP 5 >= 5.3.0, PECL intl >= 1.0.0
        # collator_create() returns NULL on failure, whereas "new Collator()" always
        # yields an object, which made the error check below unreachable.
        $c = collator_create($locale);
        if (! $c)
        {
            # intl_get_error_code() and/or intl_get_error_message() tell what happened
            trigger_error(intl_get_error_message(), E_USER_WARNING);
            return false;
        }
        return $c->compare($s1, $s2);
    }

    /**
     * Compares the first N characters of two UTF-8 strings.
     *
     * @param   string|null    $s1
     * @param   string|null    $s2
     * @param   int            $length   Number of characters (not bytes) to compare
     * @return  int|bool|null  Returns FALSE if error occurred
     *                         Returns NULL if one of the arguments is not a string
     *                         Returns < 0 if $s1 is less than $s2;
     *                                 > 0 if $s1 is greater than $s2;
     *                                 0 if they are equal.
     */
    public static function strncmp($s1, $s2, $length)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! is_string($s1) || ! is_string($s2)) return null;
        return self::strcmp(self::substr($s1, 0, $length), self::substr($s2, 0, $length));
    }

    /**
     * Implementation strcasecmp() function for UTF-8 encoding string.
*
     * Both strings are lowercased first and then compared via self::strcmp().
     *
     * @param   string|null    $s1
     * @param   string|null    $s2
     * @return  int|bool|null  Returns FALSE if error occurred
     *                         Returns < 0 if $s1 is less than $s2;
     *                                 > 0 if $s1 is greater than $s2;
     *                                 0 if they are equal.
     */
    public static function strcasecmp($s1, $s2)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! (is_string($s1) && is_string($s2))) return null;
        $lower1 = self::lowercase($s1);
        $lower2 = self::lowercase($s2);
        return self::strcmp($lower1, $lower2);
    }

    /**
     * Converts a UTF-8 string to an array of UNICODE codepoints.
     *
     * @param   string|null       $s  UTF-8 string
     * @return  array|bool|null   Unicode codepoints;
     *                            non-strings and the empty string are returned as is;
     *                            Returns FALSE if $s broken (not UTF-8)
     */
    public static function to_unicode($s)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! is_string($s) || $s === '') return $s;

        #since PHP-5.3.x iconv() is a little faster than mb_convert_encoding()
        $has_iconv = function_exists('iconv');
        if ($has_iconv || function_exists('mb_convert_encoding'))
        {
            $ucs4 = $has_iconv ? @iconv('UTF-8', 'UCS-4BE', $s)
                               : @mb_convert_encoding($s, 'UCS-4BE', 'UTF-8');
            #every codepoint is a 32-bit big endian integer now
            if (is_string($ucs4)) return array_values(unpack('N*', $ucs4));
            if ($ucs4 !== null) return false;
        }

        #pure PHP fallback: split into characters and decode each of them
        $chars = self::str_split($s);
        if (! is_array($chars)) return false;
        return array_map(array(__CLASS__, 'ord'), $chars);
    }

    /**
     * Converts a UNICODE codepoints to a UTF-8 string
     *
     * @param   array|null        $a  Unicode codepoints
     * @return  string|bool|null  UTF-8 string
     *                            Returns FALSE if error occurred
     */
    public static function from_unicode($a)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! is_array($a)) return $a;
        #since PHP-5.3.x iconv() little faster then mb_convert_encoding()
        if (function_exists('iconv'))
        {
            array_walk($a, function(&$cp) { $cp = pack('N', $cp); });
            $s = @iconv('UCS-4BE', 'UTF-8', implode('', $a));
            if (! is_string($s)) return false;
            return $s;
        }
        if (function_exists('mb_convert_encoding'))
        {
            array_walk($a, function(&$cp) { $cp = pack('N', $cp); });
            $s = mb_convert_encoding(implode('', $a), 'UTF-8', 'UCS-4BE');
            if (! 
is_string($s)) return false;
            return $s;
        }
        return implode('', array_map(array(__CLASS__, 'chr'), $a));
    }

    /**
     * Converts a UTF-8 character to a UNICODE codepoint.
     *
     * The byte length of $char (1-4) selects the decoding formula; the bytes themselves
     * are not validated. Results are memoized in a static cache.
     *
     * @param   string|null    $char  UTF-8 character
     * @return  int|bool|null  Unicode codepoint;
     *                         non-strings are returned as is;
     *                         Returns FALSE (with a warning) if $char is not 1-4 bytes long
     */
    public static function ord($char)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! is_string($char)) return $char;

        static $cache = array();
        if (array_key_exists($char, $cache)) return $cache[$char]; #speed improve

        #NOTE: square brackets are used for byte access, the curly brace form $char{N}
        #is deprecated since PHP 7.4 and is a parse error since PHP 8.0
        switch (strlen($char))
        {
            case 1 : return $cache[$char] = ord($char);
            case 2 : return $cache[$char] = (ord($char[1]) & 63) |
                                            ((ord($char[0]) & 31) << 6);
            case 3 : return $cache[$char] = (ord($char[2]) & 63) |
                                            ((ord($char[1]) & 63) << 6) |
                                            ((ord($char[0]) & 15) << 12);
            case 4 : return $cache[$char] = (ord($char[3]) & 63) |
                                            ((ord($char[2]) & 63) << 6) |
                                            ((ord($char[1]) & 63) << 12) |
                                            ((ord($char[0]) & 7)  << 18);
            default :
                trigger_error('Character 0x' . bin2hex($char) . ' is not UTF-8!', E_USER_WARNING);
                return false;
        }
    }

    /**
     * Converts a UNICODE codepoint to a UTF-8 character
     *
     * @param   int|digit|null    $cp  Unicode codepoint
     * @return  string|bool|null  UTF-8 character
     *                            Returns FALSE if error occurred
     */
    public static function chr($cp)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! is_int($cp) && ! ctype_digit($cp)) return $cp;
        static $cache = array();
        if (array_key_exists($cp, $cache)) return $cache[$cp]; #speed improve
        if ($cp <= 0x7f) return $cache[$cp] = chr($cp);
        if ($cp <= 0x7ff) return $cache[$cp] = chr(0xc0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3f));
        if ($cp <= 0xffff) return $cache[$cp] = chr(0xe0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3f)) . chr(0x80 | ($cp & 0x3f));
        if ($cp <= 0x10ffff) return $cache[$cp] = chr(0xf0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3f)) . chr(0x80 | (($cp >> 6) & 0x3f)) . 
chr(0x80 | ($cp & 0x3f));
        #U+FFFD REPLACEMENT CHARACTER
        return $cache[$cp] = "\xEF\xBF\xBD";
    }

    /**
     * Implementation chunk_split() function for UTF-8 encoding string.
     *
     * The string is cut into pieces of $length characters which are joined with $glue
     * (the glue is placed between the pieces only).
     *
     * @param   string|null       $s
     * @param   int|digit|null    $length  Piece length in characters, values < 1 mean 76
     * @param   string|null       $glue    Separator, an empty value means "\r\n"
     * @return  string|bool|null  Returns FALSE if error occurred
     */
    public static function chunk_split($s, $length = null, $glue = null)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! is_string($s) || $s === '') return $s;

        $chunk_size = intval($length);
        if ($chunk_size < 1) $chunk_size = 76;

        $separator = strval($glue);
        if ($separator === '') $separator = "\r\n";

        $chunks = self::str_split($s, $chunk_size);
        return is_array($chunks) ? implode($separator, $chunks) : false;
    }

    /**
     * Changes the case of all string keys in an array.
     * Integer keys are left untouched.
     *
     * @param   array|null       $a
     * @param   int              $mode          {CASE_LOWER|CASE_UPPER}
     * @param   bool             $is_recursive  Process nested arrays as well?
     * @return  array|bool|null  Returns FALSE if error occurred
     */
    public static function array_change_key_case($a, $mode, $is_recursive = false)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! is_array($a)) return $a;

        $result = array();
        foreach ($a as $key => $value)
        {
            $new_key = $key;
            if (is_string($key))
            {
                $new_key = self::convert_case($key, $mode);
                if ($new_key === false) return false;
            }
            if ($is_recursive && is_array($value)) #recursive support
            {
                $value = self::array_change_key_case($value, $mode, $is_recursive);
                if (! is_array($value)) return false;
            }
            $result[$new_key] = $value;
        }
        return $result;
    }

    /**
     * Converts the letter case of data in UTF-8 encoding.
     * Arrays are traversed recursively; only the values of the array
     * elements are converted, the keys stay unchanged.
     * To convert the keys only, use the method self::array_change_key_case().
*
     * @see     self::array_change_key_case()
     * @link    http://www.unicode.org/charts/PDF/U0400.pdf
     * @link    http://ru.wikipedia.org/wiki/ISO_639-1
     * @param   array|scalar|null  $data  Data of arbitrary structure
     * @param   int                $mode  {CASE_LOWER|CASE_UPPER}
     * @param   bool               $is_ascii_optimization  Use native strtoupper()/strtolower()
     *                                                     for pure ASCII strings (speed improve)
     * @return  array|scalar|bool|null  Returns FALSE if error occurred;
     *                                  non-strings and empty values are returned as is;
     *                                  for an unknown $mode a warning is raised and $data is returned unchanged
     */
    public static function convert_case($data, $mode, $is_ascii_optimization = true)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (is_array($data)) #recursive support
        {
            foreach ($data as $k => $v)
            {
                #pass the optimization flag down, otherwise it is silently reset to the default for nested values
                $data[$k] = self::convert_case($v, $mode, $is_ascii_optimization);
                #FALSE is an error marker unless the source value itself was a boolean
                if ($data[$k] === false && ! is_bool($v)) return false;
            }
            return $data;
        }
        if (! is_string($data) || ! $data) return $data;

        if ($mode === CASE_UPPER)
        {
            if ($is_ascii_optimization && self::is_ascii($data)) return strtoupper($data); #speed improve!
            #deprecated, since PHP-5.3.x strtr() 2-3 times faster then mb_strtolower()
            #if (function_exists('mb_strtoupper')) return mb_strtoupper($data, 'utf-8');
            return strtr($data, array_flip(self::$convert_case_table));
        }
        if ($mode === CASE_LOWER)
        {
            if ($is_ascii_optimization && self::is_ascii($data)) return strtolower($data); #speed improve!
            #deprecated, since PHP-5.3.x strtr() 2-3 times faster then mb_strtolower()
            #if (function_exists('mb_strtolower')) return mb_strtolower($data, 'utf-8');
            return strtr($data, self::$convert_case_table);
        }
        trigger_error('Parameter 2 should be a constant of CASE_LOWER or CASE_UPPER!', E_USER_WARNING);
        return $data;
    }

    /**
     * Convert a data to lower case
     *
     * @param   array|scalar|null  $data
     * @return  array|scalar|bool|null  Returns FALSE if error occurred
     */
    public static function lowercase($data)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        return self::convert_case($data, CASE_LOWER);
    }

    /**
     * Convert a data to upper case
     *
     * @param   array|scalar|null  $data
     * @return  array|scalar|bool|null  Returns FALSE if error occurred
     */
    public static function uppercase($data)
    {
        if (! 
ReflectionTypeHint::isValid()) return false;
        return self::convert_case($data, CASE_UPPER);
    }

    /**
     * Convert a data to lower case
     *
     * @param   array|scalar|null  $data
     * @return  scalar|bool|null  Returns FALSE if error occurred
     */
    public static function strtolower($data)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        return self::convert_case($data, CASE_LOWER);
    }

    /**
     * Convert a data to upper case
     *
     * @param   array|scalar|null  $data
     * @return  scalar|bool|null  Returns FALSE if error occurred
     */
    public static function strtoupper($data)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        return self::convert_case($data, CASE_UPPER);
    }

    /**
     * Convert all HTML entities to native UTF-8 characters.
     * Decodes many more named entities than the standard html_entity_decode().
     * All decimal and hexadecimal numeric entities are converted to UTF-8 as well.
     *
     * Example: '&quot;' or '&#34;' or '&#x22;' will be converted to '"'
     *          (only when $is_special_chars is TRUE; otherwise numeric forms of the
     *          special characters are normalized to their named entities).
     *
     * @link    http://www.htmlhelp.com/reference/html40/entities/
     * @link    http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References)
     * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true
     * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true
     * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true
     *
     * @param   scalar|null  $s
     * @param   bool         $is_special_chars   Also decode the special html entities? (< > & " ')
     * @return  scalar|null  Returns FALSE if error occurred
     */
    public static function html_entity_decode($s, $is_special_chars = false)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! is_string($s) || $s === '') return $s;

        #speed improve: the shortest possible entity takes 4 bytes,
        #and an entity needs an ampersand followed (somewhere) by a semicolon.
        #The parentheses around the assignment matter: without them $pos receives a boolean.
        if (strlen($s) < 4
            || ($pos = strpos($s, '&')) === false
            || strpos($s, ';', $pos) === false) return $s;

        $table = self::$html_entity_table;
        if ($is_special_chars)
        {
            $table += self::$html_special_chars_table + array(
                #&apos; entity is only available in XHTML/HTML5 and not in plain HTML, see http://www.w3.org/TR/xhtml1/#C_16
                '&apos;' => "\x27",  #U+0027 ['] apostrophe
            );
        }

        #replace named entities
        $s = strtr($s, $table);

        #replace numeric dec and hex entities:
        if (strpos($s, '&#') !== false) #speed improve
        {
            $class = __CLASS__;
            $html_special_chars_table_flipped = array_flip(self::$html_special_chars_table);
            $s = preg_replace_callback('/&#((x)[\da-fA-F]{1,6}+|\d{1,7}+);/sSX',
                                        function (array $m) use ($class, $html_special_chars_table_flipped, $is_special_chars)
                                        {
                                            #$m[1] still carries the leading "x" for the hex form, strip it before hexdec()
                                            $is_hex = isset($m[2]) && $m[2] === 'x';
                                            $codepoint = $is_hex ? hexdec(substr($m[1], 1)) : intval($m[1]);
                                            #Special chars must stay encoded: map them back to the named entity.
                                            #Only single-byte codepoints qualify; a wider value must not be
                                            #truncated to one byte, or e.g. U+0126 would be mistaken for "&".
                                            if (! $is_special_chars && $codepoint < 0x80)
                                            {
                                                $char = chr($codepoint);
                                                if (array_key_exists($char, $html_special_chars_table_flipped))
                                                    return $html_special_chars_table_flipped[$char];
                                            }
                                            return $class::chr($codepoint);
                                        }, $s);
        }
        return $s;
    }

    /**
     * Convert special UTF-8 characters to HTML entities.
* Функция кодирует гораздо больше именованных сущностей, чем стандартная htmlentities() * * @link http://www.htmlhelp.com/reference/html40/entities/ * @link http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References) * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true * * @param scalar|null $s * @param bool $is_special_chars_only Обрабатывать только специальные html сущности? (< > & ") * @return scalar|null Returns FALSE if error occurred */ public static function html_entity_encode($s, $is_special_chars_only = false) { if (! ReflectionTypeHint::isValid()) return false; if (! is_string($s) || $s === '') return $s; if ($is_special_chars_only) return strtr($s, array_flip(self::$html_special_chars_table)); #binary support #if ($is_special_chars_only) return htmlspecialchars($s); #DEPRECATED, charset dependent #replace UTF-8 chars to named entities: $s = strtr($s, array_flip(self::$html_entity_table)); #block below deprecated, since PHP-5.3.x strtr() 3 times faster if (0 && preg_match_all('~(?> [\xc2\xc3\xc5\xc6\xcb\xce\xcf][\x80-\xbf] #2 bytes | \xe2[\x80-\x99][\x82-\xac] #3 bytes ) ~sxSX', $s, $m)) { $table = array_flip(self::$html_entity_table); foreach (array_unique($m[0]) as $char) { if (array_key_exists($char, $table)) $s = str_replace($char, $table[$char], $s); } } return $s; } /** * Make regular expression for case insensitive match * Example (only digits): "123" => "123" * Example (only ASCII): "123_test" => "(?i:123_test)" * Example (upper ASCII): "123_слово_test" => "123_(с|С)(л|Л)(о|О)(в|В)(о|О)_[tT][eE][sS][tT]" * * @param string|null $s * @param string|null $delimiter If the optional delimiter is specified, it will also be escaped. 
*                                       This is useful for escaping the delimiter that is required by the PCRE functions.
     *                                       The / is the most commonly used delimiter.
     * @return  string|bool|null  Returns FALSE if error occurred
     */
    public static function preg_quote_case_insensitive($s, $delimiter = null)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! is_string($s) || $s === '') return $s;

        #speed improve: digits have no case, pure ASCII is handled by the (?i:) group
        if (ctype_digit($s)) return preg_quote($s, $delimiter);
        if (self::is_ascii($s)) return '(?i:' . preg_quote($s, $delimiter) . ')';

        $lower = self::convert_case($s, CASE_LOWER, false);
        if ($lower === false) return false;
        $upper = self::convert_case($s, CASE_UPPER, false);
        if ($upper === false) return false;

        #speed improve: nothing in the string depends on the case
        if ($lower === $upper) return preg_quote($s, $delimiter);

        $lower_chars = self::str_split($lower);
        if ($lower_chars === false) return false;
        $upper_chars = self::str_split($upper);
        if ($upper_chars === false) return false;

        $parts = array();
        foreach ($lower_chars as $i => $lc)
        {
            $uc = $upper_chars[$i];
            if ($lc === $uc)
            {
                #caseless character
                $parts[] = preg_quote($lc, $delimiter);
            }
            elseif (strlen($lc) === 1)
            {
                #single byte (ASCII) letter: a character class is enough
                $parts[] = '[' . self::_preg_quote_class($lc . $uc, $delimiter) . ']';
            }
            else
            {
                #for Russian and other letters, because the /u flag and (?i:word) do not help :(
                $parts[] = '(' . preg_quote($lc, $delimiter) . '|' . preg_quote($uc, $delimiter) . ')';
            }
        }
        return implode('', $parts);
    }

    /**
     * Call preg_match_all() and convert byte offsets into character offsets for PREG_OFFSET_CAPTURE flag.
     * This is regardless of whether you use /u modifier.
     *
     * @link  http://bolknote.ru/2010/09/08/~2704
     *
     * @param   string           $pattern
     * @param   string|null      $subject
     * @param   array            $matches
     * @param   int              $flags
     * @param   int              $char_offset
     * @return  array|bool|null  Returns FALSE if error occurred
     */
    public static function preg_match_all($pattern, $subject, &$matches, $flags = PREG_PATTERN_ORDER, $char_offset = 0)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! 
is_string($subject)) return $subject; $byte_offset = ($char_offset > 0) ? strlen(self::substr($subject, 0, $char_offset)) : $char_offset; $return = preg_match_all($pattern, $subject, $matches, $flags, $byte_offset); if ($return === false) return false; if ($flags & PREG_OFFSET_CAPTURE) { foreach ($matches as &$match) { foreach ($match as &$a) $a[1] = self::strlen(substr($subject, 0, $a[1])); } } return $return; } #alias for self::str_limit() public static function truncate($s, $maxlength = null, $continue = "\xe2\x80\xa6", &$is_cutted = null, $tail_min_length = 20) { return self::str_limit($s, $maxlength, $continue, $is_cutted, $tail_min_length); } /** * Обрезает текст в кодировке UTF-8 до заданной длины, * причём последнее слово показывается целиком, а не обрывается на середине. * Html сущности корректно обрабатываются. * * @param string|null $s Текст в кодировке UTF-8 * @param int|null|digit $maxlength Ограничение длины текста * @param string $continue Завершающая строка, которая будет вставлена после текста, если он обрежется * @param bool|null &$is_cutted Текст был обрезан? * @param int|digit $tail_min_length Если длина "хвоста", оставшегося после обрезки текста, меньше $tail_min_length, * то текст возвращается без изменений * @return string|bool|null Returns FALSE if error occurred */ public static function str_limit($s, $maxlength = null, $continue = "\xe2\x80\xa6", &$is_cutted = null, $tail_min_length = 20) #"\xe2\x80\xa6" = "…" { if (! ReflectionTypeHint::isValid()) return false; if (! is_string($s) || $s === '') return $s; $is_cutted = false; if ($continue === null) $continue = "\xe2\x80\xa6"; if (! $maxlength) $maxlength = 256; #speed improve block #{{{ if (strlen($s) <= $maxlength) return $s; $s2 = str_replace("\r\n", '?', $s); $s2 = preg_replace('~' . self::HTML_ENTITY_RE . '~sxSX', '?', $s2); if (strlen($s2) <= $maxlength || self::strlen($s2) <= $maxlength) return $s; #}}} $r = preg_match_all('~(?> \r\n # next line | ' . self::HTML_ENTITY_RE . ' | . 
) ~sxuSX', $s, $m); if ($r === false) return false; #d($m); if (count($m[0]) <= $maxlength) return $s; $left = implode('', array_slice($m[0], 0, $maxlength)); #из диапазона ASCII исключаем буквы, цифры, открывающие парные символы [a-zA-Z\d\(\{\[] и некоторые др. символы #нельзя вырезать в конце строки символ ";", т.к. он используются в сущностях &xxx; $left2 = rtrim($left, "\x00..\x28\x2A..\x2F\x3A\x3C..\x3E\x40\x5B\x5C\x5E..\x60\x7B\x7C\x7E\x7F"); if (strlen($left) !== strlen($left2)) $return = $left2 . $continue; else { #добавляем остаток к обрезанному слову $right = implode('', array_slice($m[0], $maxlength)); preg_match('/^(?> #цифры, закрывающие парные символы, дефис для составных слов, дата, время, IP-адреса, URL типа www.ya.ru:80! [\d\)\]\}\-\.:]+ #letters | \p{L}+ #quotation marks | [' . implode('', self::$html_quotation_mark_table) . ']+ )+ /suxSX', $right, $m); #d($m); $right = isset($m[0]) ? rtrim($m[0], '.-') : ''; $return = $left . $right; if (strlen($return) !== strlen($s)) $return .= $continue; } if (self::strlen($s) - self::strlen($return) < $tail_min_length) return $s; $is_cutted = true; return $return; } /** * Implementation str_split() function for UTF-8 encoding string. * * @param string|null $s * @param int|null|digit $length * @return array|bool|null Returns FALSE if error occurred */ public static function str_split($s, $length = null) { if (! ReflectionTypeHint::isValid()) return false; if (! is_string($s)) return $s; $length = ($length === null) ? 1 : intval($length); if ($length < 1) return false; #there are limits in regexp for {min,max}! if (preg_match_all('~.~suSX', $s, $m) === false) return false; if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) return false; if ($length === 1) $a = $m[0]; else { $a = array(); for ($i = 0, $c = count($m[0]); $i < $c; $i += $length) $a[] = implode('', array_slice($m[0], $i, $length)); } return $a; } /** * Implementation strlen() function for UTF-8 encoding string. 
*
     * Counts characters, not bytes.
     *
     * @param   string|null    $s
     * @return  int|bool|null  Returns FALSE if error occurred;
     *                         non-strings are returned as is
     */
    public static function strlen($s)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! is_string($s)) return $s;

        /*
          Preferred: mb_strlen(), since PHP-5.3.x it is faster than strlen(utf8_decode()).

          Fallback: utf8_decode() converts characters that are not in ISO-8859-1 to '?', which,
          for the purpose of counting, is quite alright. It's much faster than iconv_strlen().
          Note: this function does not count bad UTF-8 bytes in the string - these are simply ignored.

          Slower alternatives that were evaluated and rejected: iconv_strlen(),
          stripping the continuation bytes [\x80-\xBF] with preg_replace(),
          preg_match_all('~.~suSX') + count(), and a manual loop over the leading bytes.
        */
        $has_mbstring = function_exists('mb_strlen');
        return $has_mbstring ? mb_strlen($s, 'utf-8') : strlen(utf8_decode($s));
    }

    /**
     * Implementation strpos() function for UTF-8 encoding string
     *
     * @param   string|null    $s       The entire string
     * @param   string|int     $needle  The searched substring
     * @param   int|null       $offset  The optional offset parameter specifies the position from which the search should be performed
     * @return  int|bool|null  Returns the numeric position of the first occurrence of needle in haystack.
     *                         If needle is not found, will return FALSE.
     */
    public static function strpos($s, $needle, $offset = null)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! 
is_string($s)) return $s;
        if ($offset === null || $offset < 0) $offset = 0;
        #mb_strpos() faster then iconv_strpos()
        if (function_exists('mb_strpos')) return mb_strpos($s, $needle, $offset, 'utf-8');
        #iconv_strpos() deprecated, because slowly than self::strlen(substr())
        #if (function_exists('iconv_strpos')) return iconv_strpos($s, $needle, $offset, 'utf-8');
        $byte_pos = $offset;
        do if (($byte_pos = strpos($s, $needle, $byte_pos)) === false) return false;
        while (($char_pos = self::strlen(substr($s, 0, $byte_pos++))) < $offset);
        return $char_pos;
    }

    /**
     * Find position of first occurrence of a case-insensitive string.
     *
     * @param   string|null    $s       The entire string
     * @param   string|int     $needle  The searched substring
     * @param   int|null       $offset  The optional offset parameter specifies the position from which the search should be performed
     * @return  int|bool|null  Returns the numeric position of the first occurrence of needle in haystack.
     *                         If needle is not found, will return FALSE.
     */
    public static function stripos($s, $needle, $offset = null)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! is_string($s)) return $s;
        if ($offset === null || $offset < 0) $offset = 0;
        if (function_exists('mb_stripos')) return mb_stripos($s, $needle, $offset, 'utf-8');

        #optimization block (speed improve)
        #{{{
        $is_ascii_haystack = self::is_ascii($s);
        $is_ascii_needle   = self::is_ascii($needle);
        if ($is_ascii_haystack)
        {
            #an ASCII haystack can never contain a non-ASCII needle
            if (! $is_ascii_needle) return false;
            #both are ASCII, byte offsets equal character offsets
            return stripos($s, $needle, $offset);
        }
        #A non-ASCII haystack may still contain an ASCII needle ("Привет Hello" / "hello"),
        #so this combination must go through the generic search below.
        #}}}

        $s = self::convert_case($s, CASE_LOWER, false);
        if ($s === false) return false;
        $needle = self::convert_case($needle, CASE_LOWER, false);
        if ($needle === false) return false;
        return self::strpos($s, $needle, $offset);
    }

    /**
     * Implementation strrev() function for UTF-8 encoding string
     *
     * @param   string|null       $s
     * @return  string|bool|null  Returns FALSE if error occurred
     */
    public static function strrev($s)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! 
is_string($s) || $s === '') return $s;
        if (0) #TODO test speed
        {
            $s = self::_convert($s, 'UTF-8', 'UTF-32');
            if (! is_string($s)) return false;
            $s = implode('', array_reverse(str_split($s, 4)));
            return self::_convert($s, 'UTF-32', 'UTF-8');
        }
        if (! is_array($a = self::str_split($s))) return false;
        return implode('', array_reverse($a));
    }

    /**
     * Implementation substr() function for UTF-8 encoding string.
     *
     * Uses mb_substr() or iconv_substr() when available; otherwise the string is split
     * into characters (the split of the last seen string is kept in static variables).
     *
     * @link     http://www.w3.org/International/questions/qa-forms-utf-8.html
     * @param    string|null       $s
     * @param    int|digit         $offset
     * @param    int|null|digit    $length  NULL means "up to the end of the string"
     * @return   string|bool|null  Returns FALSE if error occurred
     */
    public static function substr($s, $offset, $length = null)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! is_string($s)) return $s;
        #since PHP-5.3.x mb_substr() faster then iconv_substr()
        if (function_exists('mb_substr'))
        {
            if ($length === null) $length = self::strlen($s);
            return mb_substr($s, $offset, $length, 'utf-8');
        }
        if (function_exists('iconv_substr'))
        {
            if ($length === null) $length = self::strlen($s);
            return iconv_substr($s, $offset, $length, 'utf-8');
        }
        static $_s = null;
        static $_a = null;
        if ($_s !== $s) $_a = self::str_split($_s = $s);
        if (! is_array($_a)) return false;
        if ($length !== null) $a = array_slice($_a, $offset, $length);
        else $a = array_slice($_a, $offset);
        return implode('', $a);
    }

    /**
     * Implementation substr_replace() function for UTF-8 encoding string.
     *
     * @param   string|null       $s
     * @param   string|int        $replacement
     * @param   int|digit         $start   Character offset; negative values count from the end
     * @param   int|null          $length  Number of characters to replace;
     *                                     NULL replaces everything up to the end of the string,
     *                                     0 inserts the replacement at $start
     * @return  string|bool|null  Returns FALSE if error occurred
     */
    public static function substr_replace($s, $replacement, $start, $length = null)
    {
        if (! ReflectionTypeHint::isValid()) return false;
        if (! is_string($s) || $s === '') return $s;
        $a = self::str_split($s);
        if (! is_array($a)) return false;
        #Before PHP 8.0 array_splice() coerces a NULL length to 0 and only inserts,
        #whereas the native substr_replace() replaces up to the end when no length is given.
        if ($length === null) $length = count($a);
        array_splice($a, $start, $length, $replacement);
        return implode('', $a);
    }

    /**
     * Implementation ucfirst() function for UTF-8 encoding string.
* Преобразует первый символ строки в кодировке UTF-8 в верхний регистр. * Корректно обрабатывает слова в кавычках, например: «северный поток» --> «Северный поток» * * @param string|null $s * @param bool $is_other_to_lowercase остальные символы преобразуются в нижний регистр? * @return string|bool|null Returns FALSE if error occurred */ public static function ucfirst($s, $is_other_to_lowercase = true) { if (! ReflectionTypeHint::isValid()) return false; if ($s === '' || ! is_string($s)) return $s; if (! preg_match('/^([' . implode('', self::$html_quotation_mark_table) . ']{1,2}+) #1 quotation marks (\p{L}) #2 first letter (.*+) #3 next letters $/sxuSX', $s, $m)) return $s; #letters not found return $m[1] . self::uppercase($m[2]) . ($is_other_to_lowercase ? self::lowercase($m[3]) : $m[3]); } /** * Implementation ucwords() function for UTF-8 encoding string. * Преобразует в верхний регистр первый символ каждого слова в строке в кодировке UTF-8, * остальные символы каждого слова преобразуются в нижний регистр. * * @param string|null $s * @param bool $is_other_to_lowercase остальные символы преобразуются в нижний регистр? * @param string $spaces_re * @return string|bool|null Returns FALSE if error occurred */ public static function ucwords($s, $is_other_to_lowercase = true, $spaces_re = '~([\p{Z}\s]+)~suSX') { if (! ReflectionTypeHint::isValid()) return false; if ($s === '' || ! 
is_string($s)) return $s; $words = preg_split($spaces_re, $s, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE); foreach ($words as $k => $word) { $words[$k] = self::ucfirst($word, $is_other_to_lowercase); if ($words[$k] === false) return false; } return implode('', $words); } /** * Decodes a string to UTF-8 string from some formats (can be mixed) * Examples * '%D1%82%D0%B5%D1%81%D1%82' => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82" #binary (regular) * '0xD182D0B5D181D182' => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82" #binary (compact) * '%u0442%u0435%u0441%u0442' => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82" #UCS-2 (U+0 — U+FFFF) * '%u{442}%u{435}%u{0441}%u{00442}' => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82" #UTF-8 (U+0 — U+FFFFFF) * * It is used to decode the data in the format %uXXXX, encoded deprecated * javascript's function encode(). Recommended to use encodeURIComponent(). * Obsolete format %uXXXX allows unicode only in the range of UCS-2, ie, U+0 to U+FFFF. * * @see urldecode() * @param array|scalar|null $data * @param bool $is_hex2bin Decode the HEX-data? * Example: '0xD182D0B5D181D182' => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82" * Hint: parameters in the URL address is sometimes * convenient to encode not function rawurlencode($string), * and use the following mechanism (encoded data is more compact): * '0x' . bin2hex($string) * @param bool $is_urldecode * @return array|scalar|null Returns FALSE if error occurred */ public static function unescape($data, $is_hex2bin = false, $is_urldecode = true) { if (! ReflectionTypeHint::isValid()) return false; if (is_array($data)) { $d = array(); foreach ($data as $k => &$v) { if (is_string($k)) { $k = self::unescape($k, $is_hex2bin, $is_urldecode); if (! is_string($k)) return false; } $d[$k] = self::unescape($v, $is_hex2bin, $is_urldecode); if ($d[$k] === false && ! 
is_bool($v)) return false; } return $d; } if (is_string($data)) { #use strpos() for speed improving of regexp if ($is_hex2bin && strpos($data, '0x') !== false) { $data = preg_replace_callback( '~0x((?:[\da-fA-F]{2})+)~sSX', function (array $m) { $s = pack('H' . strlen($m[1]), $m[1]); #hex2bin() return rawurlencode($s); }, $data); } if (strpos($data, '%u') !== false) { $class = __CLASS__; $data = preg_replace_callback( '~%u( [\da-fA-F]{4}+ #%uXXXX only UCS-2 | \{ [\da-fA-F]{1,6}+ \} #%u{XXXXXX} extended form for all UNICODE charts ) ~sxSX', function (array $m) use ($class) { $codepoint = hexdec(trim($m[1], '{}')); $char = $class::chr($codepoint); return rawurlencode($char); }, $data); } return $is_urldecode ? urldecode($data) : $data; } if (is_scalar($data) || is_null($data)) return $data; #~ null, integer, float, boolean return false; #object or resource } /** * 1) Corrects the global arrays $_GET, $_POST, $_COOKIE, $_REQUEST, $_FILES * decoded values ​​from %XX and extended %uXXXX / %u{XXXXXX} format, * for example, through an outdated javascript function escape(). * Standard PHP5 cannot do it. * 2) Recode $_GET, $_POST, $_COOKIE, $_REQUEST, $_FILES from $charset * encoding to UTF-8, if necessary. * A side effect is a positive protection against XSS attacks with * non-printable characters on the vulnerable PHP function. * Thus web forms can be sent to the server in 2-encoding: $charset and UTF-8. * For example: ?тест[тест]=тест * 3) If in the HTTP_COOKIE there are parameters with the same name, * takes the last value (as in the QUERY_STRING), not the first. * 4) Creates an array of $_POST for non-standard Content-Type, for example, * "Content-Type: application/octet-stream". Standard PHP5 creates * an array for "Content-Type: application/x-www-form-urlencoded" * and "Content-Type: multipart/form-data". 
* * Examples * '%F2%E5%F1%F2' => 'тест' #CP1251 (regular) * '0xF2E5F1F2' => 'тест' #CP1251 (compact) * '%D1%82%D0%B5%D1%81%D1%82' => 'тест' #UTF-8 (regular) * '0xD182D0B5D181D182' => 'тест' #UTF-8 (compact) * '%u0442%u0435%u0441%u0442' => 'тест' #UCS-2 (U+0 — U+FFFF) * '%u{442}%u{435}%u{0441}%u{00442}' => 'тест' #UTF-8 (U+0 — U+FFFFFF) * * Сессии, куки и независимая авторизация на поддоменах. * * ПРИМЕР 1 * У рабочего сайта http://domain.com появились поддомены. * Для кроссдоменной авторизации через механизм сессий имя хоста для COOKIE было изменено с "domain.com" на ".domain.com" * В результате авторизация не работает. Решение: поменять имя сессии. * Ещё помогает очистка COOKIE, но их принудительная очистка на тысячах пользовательских компьютеров проблематична. * PHP не правильно (?) обрабатывает заголовок HTTP_COOKIE, если там встречаются параметры с одинаковым именем, но разными значениями. * Пример запроса HTTP-заголовка клиентом: "Cookie: sid=chpgs2fiak-330mzqza; sid=cmz5tnp5zz-xlbbgqp" * В этом случае сервер берёт первое значение, а не последнее. * Хотя если в QUERY_STRING есть такая ситуация, всегда берётся последний параметр. * В HTTP_COOKIE два параметра с одинаковым именем могут появиться, если отправить клиенту следующие HTTP-заголовки: * "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=domain.com" (только domain.com) * "Set-Cookie: sid=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (domain.com и все его поддомены) * * ПРИМЕР 2 * Есть рабочие сайты: http://domain.com (основной), http://admin.domain.com (админка), * http://sub1.domain.com (подпроект 1), http://sub2.domain.com, (подпроект 2). * Так же имеется сервер разработки http://dev.domain.com, на котором м. б. свои поддомены. * Требуется сделать независимую кросс-доменную авторизацию для http://*.domain.com и http://*.dev.domain.com. 
* To keep the authorization state we use a session whose name and value are written to a COOKIE.
     * Because the http://*.dev.domain.com domains overlap with the http://*.domain.com domains,
     * independent authorization requires different session names!
     * Example of the HTTP response headers sent by the server:
     *   "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (.domain.com and all its subdomains)
     *   "Set-Cookie: sid.dev=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.dev.domain.com" (dev.domain.com and all its subdomains)
     *
     * Implementation notes
     *   - Only raw request strings (QUERY_STRING, the raw POST body, HTTP_COOKIE) are re-parsed;
     *     $_FILES is an array and is never modified by this method.
     *   - An empty raw POST body is skipped, so the $_POST array that PHP has already built
     *     (e. g. for "multipart/form-data", where php://input is empty) is left intact.
     *   - A literal "&" inside a cookie value is kept as part of that value.
     *   - $_REQUEST is rebuilt as $_COOKIE + $_POST + $_GET (the leftmost array wins on equal keys)
     *     when at least one of the arrays has been re-parsed.
     *
     * @link     http://tools.ietf.org/html/rfc2965  RFC 2965 - HTTP State Management Mechanism
     * @param    bool    $is_hex2bin  Decode the HEX-data?
     *                                Example: '0xD182D0B5D181D182' => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"
     *                                Hint: it is sometimes convenient to encode URL parameters
     *                                not with rawurlencode($string), but with the following
     *                                mechanism (the encoded data is more compact):
     *                                '0x' . bin2hex($string)
     * @param    string  $charset     Charset handed to convert_from() to recode the parsed arrays
     * @return   bool                 FALSE if unescape() or convert_from() returns FALSE
     *                                (in the latter case an E_USER_WARNING is triggered), TRUE otherwise
     */
    public static function unescape_request($is_hex2bin = false, $charset = 'ISO-8859-1')
    {
        $fixed = false;

        #ATTENTION! HTTP_RAW_POST_DATA is only accessible when Content-Type of POST request
        #is NOT default "application/x-www-form-urlencoded"!
        $HTTP_RAW_POST_DATA = null;
        if (isset($_SERVER['REQUEST_METHOD']) && $_SERVER['REQUEST_METHOD'] === 'POST')
        {
            #reading the stream is best-effort: on failure we get FALSE and the POST body is skipped below
            $HTTP_RAW_POST_DATA = isset($GLOBALS['HTTP_RAW_POST_DATA'])
                ? $GLOBALS['HTTP_RAW_POST_DATA']
                : @file_get_contents('php://input');
        }
        if (ini_get('always_populate_raw_post_data')) $GLOBALS['HTTP_RAW_POST_DATA'] = $HTTP_RAW_POST_DATA;

        #raw request strings to re-parse, keyed by the name of the superglobal they populate;
        #$_FILES is intentionally absent: it is an array, so there is no raw string to parse
        $sources = array(
            '_GET'    => isset($_SERVER['QUERY_STRING']) ? $_SERVER['QUERY_STRING'] : null,
            '_POST'   => $HTTP_RAW_POST_DATA,
            '_COOKIE' => isset($_SERVER['HTTP_COOKIE']) ? $_SERVER['HTTP_COOKIE'] : null,
        );

        foreach ($sources as $k => $v)
        {
            if (! is_string($v)) continue;

            #php://input is empty for "multipart/form-data" requests: parsing the empty string
            #would replace the $_POST array already built by PHP with an empty one
            if ($k === '_POST' && $v === '') continue;

            if ($k === '_COOKIE')
            {
                #protect a literal "&" of a cookie value, otherwise parse_str() would treat it
                #as a parameter separator; parse_str() decodes "%26" back to "&"
                $v = str_replace('&', '%26', $v);
                #turn the cookie pairs separator into the query string separator
                $v = preg_replace('/; *+/sSX', '&', $v);
                unset($_COOKIE); #we parse HTTP_COOKIE ourselves to process it the same way as QUERY_STRING
            }

            #decode the %uXXXX / %u{XXXXXX} / 0xXX forms only, urldecoding is left to parse_str()
            $v = self::unescape($v, $is_hex2bin, false);
            if ($v === false) return false;

            parse_str($v, $GLOBALS[$k]);
            $GLOBALS[$k] = self::convert_from($GLOBALS[$k], $charset);
            if ($GLOBALS[$k] === false)
            {
                trigger_error('Array $' . $k . ' does not have keys/values in UTF-8 charset!', E_USER_WARNING);
                return false;
            }
            $fixed = true;
        }

        if ($fixed)
        {
            #the "+" operator keeps the value of the leftmost array on equal keys
            $_REQUEST =
                (isset($_COOKIE) ? $_COOKIE : array()) +
                (isset($_POST) ? $_POST : array()) +
                (isset($_GET) ? $_GET : array());
        }
        return true;
    }

    /**
     * Calculates the height of the edit text in