diff --git a/CHANGELOG.md b/CHANGELOG.md index eb5958ce..657dabff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ **Merged pull requests:** - Release v2.1.5-2023.10 🎉 +- Updated UTF8 & ReflectionTypeHint classes [\#318](https://github.com/torrentpier/torrentpier-lts/pull/318) ([belomaxorka](https://github.com/belomaxorka)) - Updated Text_LangCorrect class [\#309](https://github.com/torrentpier/torrentpier-lts/pull/309) ([belomaxorka](https://github.com/belomaxorka)) - Minor improvements [\#297](https://github.com/torrentpier/torrentpier-lts/pull/297), [\#298](https://github.com/torrentpier/torrentpier-lts/pull/298), [\#300](https://github.com/torrentpier/torrentpier-lts/pull/300), [\#301](https://github.com/torrentpier/torrentpier-lts/pull/301), [\#302](https://github.com/torrentpier/torrentpier-lts/pull/302), [\#303](https://github.com/torrentpier/torrentpier-lts/pull/303), [\#305](https://github.com/torrentpier/torrentpier-lts/pull/305), [\#306](https://github.com/torrentpier/torrentpier-lts/pull/306), [\#307](https://github.com/torrentpier/torrentpier-lts/pull/307), [\#310](https://github.com/torrentpier/torrentpier-lts/pull/310), [\#312](https://github.com/torrentpier/torrentpier-lts/pull/312), [\#313](https://github.com/torrentpier/torrentpier-lts/pull/313), [\#315](https://github.com/torrentpier/torrentpier-lts/pull/315), [\#316](https://github.com/torrentpier/torrentpier-lts/pull/316), [\#317](https://github.com/torrentpier/torrentpier-lts/pull/317) ([belomaxorka](https://github.com/belomaxorka)) diff --git a/library/includes/classes/reflection.php b/library/includes/classes/reflection.php index ba0f5905..c5bcc5e1 100644 --- a/library/includes/classes/reflection.php +++ b/library/includes/classes/reflection.php @@ -1,21 +1,36 @@ - 'is_int', - 'integer' => 'is_int', - 'digit' => 'ctype_digit', - 'number' => 'ctype_digit', - 'float' => 'is_float', - 'double' => 'is_float', - 'real' => 'is_float', - 'numeric' => 'is_numeric', - 'str' => 'is_string', - 'string' => 'is_string', - 'char' => 'is_string', - 'bool' => 'is_bool', - 'boolean' => 'is_bool', - 'null' => 'is_null', - 'array' => 'is_array', - 'obj' => 'is_object', - 'object' => 'is_object', - 'res' => 'is_resource', + 'int' => 'is_int', + 'integer' => 'is_int', + 'digit' => 'ctype_digit', + 'number' => 'ctype_digit', + 'float' => 'is_float', + 'double' => 'is_float', + 'real' => 'is_float', + 'numeric' => 'is_numeric', + 'str' => 'is_string', + 'string' => 'is_string', + 'char' => 'is_string', + 'bool' => 'is_bool', + 'boolean' => 'is_bool', + 'null' => 'is_null', + 'array' => 'is_array', + 'obj' => 'is_object', + 'object' => 'is_object', + 'res' => 'is_resource', 'resource' => 'is_resource', - 'scalar' => 'is_scalar', #integer, float, string or boolean - 'cb' => 'is_callable', + 'scalar' => 'is_scalar', #integer, float, string or boolean + 'cb' => 'is_callable', 'callback' => 'is_callable', ); #calling the methods of this class only statically! - private function __construct() {} + private function __construct() + { + } public static function isValid() { - if (! assert_options(ASSERT_ACTIVE)) return true; + if (!assert_options(ASSERT_ACTIVE)) return true; $bt = self::debugBacktrace(null, 1); extract($bt); //to $file, $line, $function, $class, $object, $type, $args - if (! $args) return true; #speed improve + if (!$args) return true; #speed improve $r = new ReflectionMethod($class, $function); $doc = $r->getDocComment(); - $cache_id = $class. $type. $function; + $cache_id = $class . $type . $function; preg_match_all('~ [\r\n]++ [\x20\t]++ \* [\x20\t]++ @param [\x20\t]++ @@ -79,34 +95,30 @@ class ReflectionTypeHint ~sixSX', $doc, $params, PREG_SET_ORDER); $parameters = $r->getParameters(); //d($args, $params, $parameters); - if (count($parameters) > count($params)) - { + if (count($parameters) > count($params)) { $message = 'phpDoc %d piece(s) @param description expected in %s%s%s(), %s given, ' . PHP_EOL - . 'called in %s on line %d ' . PHP_EOL - . 'and defined in %s on line %d'; + . 'called in %s on line %d ' . PHP_EOL + . 'and defined in %s on line %d'; $message = sprintf($message, count($parameters), $class, $type, $function, count($params), $file, $line, $r->getFileName(), $r->getStartLine()); trigger_error($message, E_USER_NOTICE); } - foreach ($args as $i => $value) - { - if (! isset($params[$i])) return true; - if ($parameters[$i]->name !== $params[$i][2]) - { + foreach ($args as $i => $value) { + if (!isset($params[$i])) return true; + if ($parameters[$i]->name !== $params[$i][2]) { $param_num = $i + 1; $message = 'phpDoc @param %d in %s%s%s() must be named as $%s, $%s given, ' . PHP_EOL - . 'called in %s on line %d ' . PHP_EOL - . 'and defined in %s on line %d'; + . 'called in %s on line %d ' . PHP_EOL + . 'and defined in %s on line %d'; $message = sprintf($message, $param_num, $class, $type, $function, $parameters[$i]->name, $params[$i][2], $file, $line, $r->getFileName(), $r->getStartLine()); trigger_error($message, E_USER_NOTICE); } $hints = preg_split('~[|/,]~sSX', $params[$i][1]); - if (! self::checkValueTypes($hints, $value)) - { + if (!self::checkValueTypes($hints, $value)) { $param_num = $i + 1; $message = 'Argument %d passed to %s%s%s() must be an %s, %s given, ' . PHP_EOL - . 'called in %s on line %d ' . PHP_EOL - . 'and defined in %s on line %d'; + . 'called in %s on line %d ' . PHP_EOL + . 'and defined in %s on line %d'; $message = sprintf($message, $param_num, $class, $type, $function, implode('|', $hints), (is_object($value) ? get_class($value) . ' ' : '') . gettype($value), $file, $line, $r->getFileName(), $r->getStartLine()); trigger_error($message, E_USER_WARNING); return false; @@ -120,8 +132,8 @@ class ReflectionTypeHint * (totally skip them correcting caller references). * If $return_frame is present, return only $return_frame matched caller, not all stacktrace. * - * @param string|null $re_ignore example: '~^' . preg_quote(__CLASS__, '~') . '(?![a-zA-Z\d])~sSX' - * @param int|null $return_frame + * @param string|null $re_ignore example: '~^' . preg_quote(__CLASS__, '~') . '(?![a-zA-Z\d])~sSX' + * @param int|null $return_frame * @return array */ public static function debugBacktrace($re_ignore = null, $return_frame = null) @@ -130,20 +142,18 @@ class ReflectionTypeHint $a = array(); $frames = 0; - for ($i = 0, $n = count($trace); $i < $n; $i++) - { + for ($i = 0, $n = count($trace); $i < $n; $i++) { $t = $trace[$i]; - if (! $t) continue; + if (!$t) continue; // Next frame. - $next = isset($trace[$i+1])? $trace[$i+1] : null; + $next = isset($trace[$i + 1]) ? $trace[$i + 1] : null; // Dummy frame before call_user_func*() frames. - if (! isset($t['file']) && $next) - { - $t['over_function'] = $trace[$i+1]['function']; - $t = $t + $trace[$i+1]; - $trace[$i+1] = null; // skip call_user_func on next iteration + if (!isset($t['file']) && $next) { + $t['over_function'] = $trace[$i + 1]['function']; + $t = $t + $trace[$i + 1]; + $trace[$i + 1] = null; // skip call_user_func on next iteration } // Skip myself frame. @@ -151,11 +161,10 @@ class ReflectionTypeHint // 'class' and 'function' field of next frame define where this frame function situated. // Skip frames for functions situated in ignored places. - if ($re_ignore && $next) - { + if ($re_ignore && $next) { // Name of function "inside which" frame was generated. $frame_caller = (isset($next['class']) ? $next['class'] . $next['type'] : '') - . (isset($next['function']) ? $next['function'] : ''); + . (isset($next['function']) ? $next['function'] : ''); if (preg_match($re_ignore, $frame_caller)) continue; } @@ -169,14 +178,13 @@ class ReflectionTypeHint /** * Checks a value to the allowed types * - * @param array $types - * @param mixed $value + * @param array $types + * @param mixed $value * @return bool */ public static function checkValueTypes(array $types, $value) { - foreach ($types as $type) - { + foreach ($types as $type) { $type = strtolower($type); if (array_key_exists($type, self::$hints) && call_user_func(self::$hints[$type], $value)) return true; if (is_object($value) && @is_a($value, $type)) return true; @@ -184,4 +192,4 @@ class ReflectionTypeHint } return false; } -} \ No newline at end of file +} diff --git a/library/includes/classes/utf8.php b/library/includes/classes/utf8.php index 4a03eac1..964e65b4 100644 --- a/library/includes/classes/utf8.php +++ b/library/includes/classes/utf8.php @@ -1,37 +1,42 @@ - = 5.3.x + * * The methods that take and return a string, are able to take and return null. + * This useful for selects from a database. + * * Several methods are able to process arrays recursively: + * array_change_key_case(), convert_from(), convert_to(), strict(), is_utf8(), blocks_check(), convert_case(), lowercase(), uppercase(), unescape() + * * Validating method parameters to allowed types via reflection (You can disable it) + * * A single interface and encapsulation, You can inherit and override + * * Test coverage + * * PHP >= 5.3.x * * In Russian: * * Поддержка UTF-8 в PHP 5. * - * Возможности и преимущества использования этого класса + * Возможности и преимущества * * Совместимость с интерфейсом стандартных PHP функций, работающих с однобайтовыми кодировками * * Возможность работы без PHP расширений ICONV и MBSTRING, если они есть, то активно используются! + * Используется наиболее быстрый из доступных методов между MBSTRING, ICONV, родной реализацией на PHP и хаками. * * Полезные функции, отсутствующие в ICONV и MBSTRING - * * Методы, которые принимают и возвращают строку, умеют принимать и возвращать null (удобно при выборках значений из базы данных) - * * Несколько методов умеют обрабатывать массивы рекурсивно - * * Единый интерфейс и инкапсуляция (можно унаследоваться и переопределить методы) - * * Высокая производительность, надёжность и качественный код + * * Методы, которые принимают и возвращают строку, умеют принимать и возвращать null. + * Это удобно при выборках значений из базы данных. + * * Несколько методов умеют обрабатывать массивы рекурсивно: + * array_change_key_case(), convert_from(), convert_to(), strict(), is_utf8(), blocks_check(), convert_case(), lowercase(), uppercase(), unescape() + * * Проверка у методов входных параметров на допустимые типы через рефлексию (можно отключить) + * * Единый интерфейс и инкапсуляция, можно унаследоваться и переопределить методы + * * Покрытие тестами * * PHP >= 5.3.x * * Example: @@ -60,32 +65,47 @@ if (!defined('BB_ROOT')) die(basename(__FILE__)); * @link http://code.google.com/p/php5-utf8/ * @license http://creativecommons.org/licenses/by-sa/3.0/ * @author Nasibullin Rinat - * @version 2.2.2 + * @version 2.3.1 */ - class UTF8 { - #REPLACEMENT CHARACTER (for broken char) + /** + * REPLACEMENT CHARACTER (for broken char) + * + * @var string + */ const REPLACEMENT_CHAR = "\xEF\xBF\xBD"; #U+FFFD /** - * Regular expression for a character in UTF-8 without the use of a flag /u - * @deprecated Instead, use a dot (".") and the flag /u, it works faster! + * Byte order mark, http://en.wikipedia.org/wiki/Byte_Order_Mark + * * @var string */ - public static $char_re = ' [\x09\x0A\x0D\x20-\x7E] # ASCII strict - # [\x00-\x7F] # ASCII non-strict (including control chars) - | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte - | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs - | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte - | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates - | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 - | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 - | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 - '; + const BOM = "\xEF\xBB\xBF"; + + /** + * Regular expression for a character in UTF-8. + * For engines, which don't support UTF8 mode. + * In PCRE use a dot (".") and the flag /u, it works much faster! + * + * @var string + */ + const CHAR_RE = + '[\x09\x0A\x0D\x20-\x7E] # ASCII strict + # [\x00-\x7F] # ASCII non-strict (including control chars) + | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte + | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs + | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte + | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates + | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 + | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 + | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 + '; /** * Combining diactrical marks (Unicode 5.1). + * \p{M} in PCRE terms. + * For engines, which don't support UTF8 mode. * * For example, russian letters in composed form: "Ё" (U+0401), "Й" (U+0419), * decomposed form: (U+0415 U+0308), (U+0418 U+0306) @@ -95,21 +115,126 @@ class UTF8 * @link http://www.unicode.org/charts/PDF/UFE20.pdf * @var string */ - #public static $diactrical_re = '\p{M}'; #alternative, but only with /u flag - public static $diactrical_re = ' \xcc[\x80-\xb9]|\xcd[\x80-\xaf] #UNICODE range: U+0300 — U+036F (for letters) - | \xe2\x83[\x90-\xbf] #UNICODE range: U+20D0 — U+20FF (for symbols) - | \xe1\xb7[\x80-\xbf] #UNICODE range: U+1DC0 — U+1DFF (supplement) - | \xef\xb8[\xa0-\xaf] #UNICODE range: U+FE20 — U+FE2F (combining half marks) - '; + const DIACTRICAL_RE = + ' \xcc[\x80-\xb9]|\xcd[\x80-\xaf] #UNICODE range: U+0300 — U+036F (for letters) + | \xe2\x83[\x90-\xbf] #UNICODE range: U+20D0 — U+20FF (for symbols) + | \xe1\xb7[\x80-\xbf] #UNICODE range: U+1DC0 — U+1DFF (supplement) + | \xef\xb8[\xa0-\xaf] #UNICODE range: U+FE20 — U+FE2F (combining half marks) + '; /** - * @var array + * \p{Lu} in PCRE terms. + * For engines, which don't support UTF8 mode. + * + * @var string + */ + const CHAR_UPPER_RE = '[\x41-\x5a] + | \xc3[\x80-\x9e] + | \xc4[\x80-\xbf] + | \xc5[\x81-\xbd] + | \xc6[\x81-\xbc] + | \xc7[\x85-\xbe] + | \xc8[\x80-\xb2] + | \xce[\x86-\xab] + | \xcf[\x98-\xae] + | \xd0[\x80-\xaf] + | \xd1[\xa0-\xbe] + | \xd2[\x80-\xbe] + | \xd3[\x81-\xb8] + | \xd4[\x80-\xbf] + | \xd5[\x80-\x96] + | \xe1[\xb8\xb9\xba][\x80-\xbe] + | \xe1\xbb[\x80-\xb8] + | \xe1\xbc[\x88-\xbf] + | \xe1\xbd[\x88-\xaf] + | \xe1[\xbe\xbf][\x88-\xbc] + | \xef\xbc[\xa1-\xba] + '; + + /** + * \p{Ll} in PCRE terms. + * For engines, which don't support UTF8 mode. + * + * @var string + */ + const CHAR_LOWER_RE = '[\x61-\x7a] + | \xc2\xb5 + | \xc3[\xa0-\xbf] + | \xc4[\x81-\xbe] + | \xc5[\x80-\xbe] + | \xc6[\x83-\xbf] + | \xc7[\x86-\xbf] + | \xc8[\x81-\xb3] + | \xc9[\x93-\xb5] + | \xca[\x80-\x92] + | \xce[\xac-\xbf] + | \xcf[\x80-\xaf] + | \xd0[\xb0-\xbf] + | \xd1[\x80-\xbf] + | \xd2[\x81-\xbf] + | \xd3[\x82-\xb9] + | \xd4[\x81-\x8f] + | \xd5[\xa1-\xbf] + | \xd6[\x80-\x86] + | \xe1[\xb8\xb9\xba][\x81-\xbf] + | \xe1\xbb[\x81-\xb9] + | \xe1\xbc[\x80-\xb7] + | \xe1\xbd[\x80-\xbd] + | \xe1\xbe[\x80-\xb3] + | \xe1\xbf[\x83-\xb3] + | \xef\xbd[\x81-\x9a] + '; + + /** + * HTML entities, examples: > Ö ˜ " + * + * @var string + */ + const HTML_ENTITY_RE = '&(?> [a-zA-Z][a-zA-Z\d]++ + | \#(?> \d{1,4}+ + | x[\da-fA-F]{2,4}+ + ) + ); + '; + + /** + * Quotation marks. + * For engines, which don't support UTF8 mode. + * + * @var string + */ + const QUOTATION_MARK_RE = '\x22|\xc2[\xab\xbb]|\xe2\x80[\x98\x99\x9a\x9c\x9d\x9e\xb9\xba]'; + + /** + * + * @var array + */ + public static $html_quotation_mark_table = array( + '"' => "\x22", #U+0022 ["] " quotation mark = APL quote + '«' => "\xc2\xab", #U+00AB [«] left-pointing double angle quotation mark = left pointing guillemet + '»' => "\xc2\xbb", #U+00BB [»] right-pointing double angle quotation mark = right pointing guillemet + '‘' => "\xe2\x80\x98", #U+2018 [‘] left single quotation mark + '’' => "\xe2\x80\x99", #U+2019 [’] right single quotation mark (and apostrophe!) + '‚' => "\xe2\x80\x9a", #U+201A [‚] single low-9 quotation mark + '“' => "\xe2\x80\x9c", #U+201C [“] left double quotation mark + '”' => "\xe2\x80\x9d", #U+201D [”] right double quotation mark + '„' => "\xe2\x80\x9e", #U+201E [„] double low-9 quotation mark + '‹' => "\xe2\x80\xb9", #U+2039 [‹] single left-pointing angle quotation mark + '›' => "\xe2\x80\xba", #U+203A [›] single right-pointing angle quotation mark + ); + + /** + * HTML special chars table + * + * @var array */ public static $html_special_chars_table = array( '"' => "\x22", #U+0022 ["] " quotation mark = APL quote - '&' => "\x26", #U+0026 [&] & ampersand - '<' => "\x3c", #U+003C [<] < less-than sign - '>' => "\x3e", #U+003E [>] > greater-than sign + '&' => "\x26", #U+0026 [&] & ampersand + '<' => "\x3c", #U+003C [<] < less-than sign + '>' => "\x3e", #U+003E [>] > greater-than sign + #' entity is only available in XHTML/HTML5 and not in plain HTML, see http://www.w3.org/TR/xhtml1/#C_16 + #''' => "\x27", #U+0027 ['] ' apostrophe ); /** @@ -118,34 +243,34 @@ class UTF8 */ public static $html_entity_table = array( #Latin-1 Entities: - ' ' => "\xc2\xa0", #U+00A0 [ ] no-break space = non-breaking space - '¡' => "\xc2\xa1", #U+00A1 [¡] inverted exclamation mark - '¢' => "\xc2\xa2", #U+00A2 [¢] cent sign - '£' => "\xc2\xa3", #U+00A3 [£] pound sign + ' ' => "\xc2\xa0", #U+00A0 [ ] no-break space = non-breaking space + '¡' => "\xc2\xa1", #U+00A1 [¡] inverted exclamation mark + '¢' => "\xc2\xa2", #U+00A2 [¢] cent sign + '£' => "\xc2\xa3", #U+00A3 [£] pound sign '¤' => "\xc2\xa4", #U+00A4 [¤] currency sign - '¥' => "\xc2\xa5", #U+00A5 [¥] yen sign = yuan sign + '¥' => "\xc2\xa5", #U+00A5 [¥] yen sign = yuan sign '¦' => "\xc2\xa6", #U+00A6 [¦] broken bar = broken vertical bar - '§' => "\xc2\xa7", #U+00A7 [§] section sign - '¨' => "\xc2\xa8", #U+00A8 [¨] diaeresis = spacing diaeresis - '©' => "\xc2\xa9", #U+00A9 [©] copyright sign - 'ª' => "\xc2\xaa", #U+00AA [ª] feminine ordinal indicator - '«' => "\xc2\xab", #U+00AB [«] left-pointing double angle quotation mark = left pointing guillemet - '¬' => "\xc2\xac", #U+00AC [¬] not sign - '­' => "\xc2\xad", #U+00AD [ ] soft hyphen = discretionary hyphen - '®' => "\xc2\xae", #U+00AE [®] registered sign = registered trade mark sign - '¯' => "\xc2\xaf", #U+00AF [¯] macron = spacing macron = overline = APL overbar - '°' => "\xc2\xb0", #U+00B0 [°] degree sign + '§' => "\xc2\xa7", #U+00A7 [§] section sign + '¨' => "\xc2\xa8", #U+00A8 [¨] diaeresis = spacing diaeresis + '©' => "\xc2\xa9", #U+00A9 [©] copyright sign + 'ª' => "\xc2\xaa", #U+00AA [ª] feminine ordinal indicator + '«' => "\xc2\xab", #U+00AB [«] left-pointing double angle quotation mark = left pointing guillemet + '¬' => "\xc2\xac", #U+00AC [¬] not sign + '­' => "\xc2\xad", #U+00AD [ ] soft hyphen = discretionary hyphen + '®' => "\xc2\xae", #U+00AE [®] registered sign = registered trade mark sign + '¯' => "\xc2\xaf", #U+00AF [¯] macron = spacing macron = overline = APL overbar + '°' => "\xc2\xb0", #U+00B0 [°] degree sign '±' => "\xc2\xb1", #U+00B1 [±] plus-minus sign = plus-or-minus sign - '²' => "\xc2\xb2", #U+00B2 [²] superscript two = superscript digit two = squared - '³' => "\xc2\xb3", #U+00B3 [³] superscript three = superscript digit three = cubed - '´' => "\xc2\xb4", #U+00B4 [´] acute accent = spacing acute - 'µ' => "\xc2\xb5", #U+00B5 [µ] micro sign - '¶' => "\xc2\xb6", #U+00B6 [¶] pilcrow sign = paragraph sign + '²' => "\xc2\xb2", #U+00B2 [²] superscript two = superscript digit two = squared + '³' => "\xc2\xb3", #U+00B3 [³] superscript three = superscript digit three = cubed + '´' => "\xc2\xb4", #U+00B4 [´] acute accent = spacing acute + 'µ' => "\xc2\xb5", #U+00B5 [µ] micro sign + '¶' => "\xc2\xb6", #U+00B6 [¶] pilcrow sign = paragraph sign '·' => "\xc2\xb7", #U+00B7 [·] middle dot = Georgian comma = Greek middle dot - '¸' => "\xc2\xb8", #U+00B8 [¸] cedilla = spacing cedilla - '¹' => "\xc2\xb9", #U+00B9 [¹] superscript one = superscript digit one - 'º' => "\xc2\xba", #U+00BA [º] masculine ordinal indicator - '»' => "\xc2\xbb", #U+00BB [»] right-pointing double angle quotation mark = right pointing guillemet + '¸' => "\xc2\xb8", #U+00B8 [¸] cedilla = spacing cedilla + '¹' => "\xc2\xb9", #U+00B9 [¹] superscript one = superscript digit one + 'º' => "\xc2\xba", #U+00BA [º] masculine ordinal indicator + '»' => "\xc2\xbb", #U+00BB [»] right-pointing double angle quotation mark = right pointing guillemet '¼' => "\xc2\xbc", #U+00BC [¼] vulgar fraction one quarter = fraction one quarter '½' => "\xc2\xbd", #U+00BD [½] vulgar fraction one half = fraction one half '¾' => "\xc2\xbe", #U+00BE [¾] vulgar fraction three quarters = fraction three quarters @@ -153,224 +278,224 @@ class UTF8 #Latin capital letter 'À' => "\xc3\x80", #Latin capital letter A with grave = Latin capital letter A grave 'Á' => "\xc3\x81", #Latin capital letter A with acute - 'Â' => "\xc3\x82", #Latin capital letter A with circumflex + 'Â' => "\xc3\x82", #Latin capital letter A with circumflex 'Ã' => "\xc3\x83", #Latin capital letter A with tilde - 'Ä' => "\xc3\x84", #Latin capital letter A with diaeresis - 'Å' => "\xc3\x85", #Latin capital letter A with ring above = Latin capital letter A ring - 'Æ' => "\xc3\x86", #Latin capital letter AE = Latin capital ligature AE + 'Ä' => "\xc3\x84", #Latin capital letter A with diaeresis + 'Å' => "\xc3\x85", #Latin capital letter A with ring above = Latin capital letter A ring + 'Æ' => "\xc3\x86", #Latin capital letter AE = Latin capital ligature AE 'Ç' => "\xc3\x87", #Latin capital letter C with cedilla 'È' => "\xc3\x88", #Latin capital letter E with grave 'É' => "\xc3\x89", #Latin capital letter E with acute - 'Ê' => "\xc3\x8a", #Latin capital letter E with circumflex - 'Ë' => "\xc3\x8b", #Latin capital letter E with diaeresis + 'Ê' => "\xc3\x8a", #Latin capital letter E with circumflex + 'Ë' => "\xc3\x8b", #Latin capital letter E with diaeresis 'Ì' => "\xc3\x8c", #Latin capital letter I with grave 'Í' => "\xc3\x8d", #Latin capital letter I with acute - 'Î' => "\xc3\x8e", #Latin capital letter I with circumflex - 'Ï' => "\xc3\x8f", #Latin capital letter I with diaeresis - 'Ð' => "\xc3\x90", #Latin capital letter ETH + 'Î' => "\xc3\x8e", #Latin capital letter I with circumflex + 'Ï' => "\xc3\x8f", #Latin capital letter I with diaeresis + 'Ð' => "\xc3\x90", #Latin capital letter ETH 'Ñ' => "\xc3\x91", #Latin capital letter N with tilde 'Ò' => "\xc3\x92", #Latin capital letter O with grave 'Ó' => "\xc3\x93", #Latin capital letter O with acute - 'Ô' => "\xc3\x94", #Latin capital letter O with circumflex + 'Ô' => "\xc3\x94", #Latin capital letter O with circumflex 'Õ' => "\xc3\x95", #Latin capital letter O with tilde - 'Ö' => "\xc3\x96", #Latin capital letter O with diaeresis - '×' => "\xc3\x97", #U+00D7 [×] multiplication sign + 'Ö' => "\xc3\x96", #Latin capital letter O with diaeresis + '×' => "\xc3\x97", #U+00D7 [×] multiplication sign 'Ø' => "\xc3\x98", #Latin capital letter O with stroke = Latin capital letter O slash 'Ù' => "\xc3\x99", #Latin capital letter U with grave 'Ú' => "\xc3\x9a", #Latin capital letter U with acute - 'Û' => "\xc3\x9b", #Latin capital letter U with circumflex - 'Ü' => "\xc3\x9c", #Latin capital letter U with diaeresis + 'Û' => "\xc3\x9b", #Latin capital letter U with circumflex + 'Ü' => "\xc3\x9c", #Latin capital letter U with diaeresis 'Ý' => "\xc3\x9d", #Latin capital letter Y with acute - 'Þ' => "\xc3\x9e", #Latin capital letter THORN + 'Þ' => "\xc3\x9e", #Latin capital letter THORN #Latin small letter - 'ß' => "\xc3\x9f", #Latin small letter sharp s = ess-zed + 'ß' => "\xc3\x9f", #Latin small letter sharp s = ess-zed 'à' => "\xc3\xa0", #Latin small letter a with grave = Latin small letter a grave 'á' => "\xc3\xa1", #Latin small letter a with acute - 'â' => "\xc3\xa2", #Latin small letter a with circumflex + 'â' => "\xc3\xa2", #Latin small letter a with circumflex 'ã' => "\xc3\xa3", #Latin small letter a with tilde - 'ä' => "\xc3\xa4", #Latin small letter a with diaeresis - 'å' => "\xc3\xa5", #Latin small letter a with ring above = Latin small letter a ring - 'æ' => "\xc3\xa6", #Latin small letter ae = Latin small ligature ae + 'ä' => "\xc3\xa4", #Latin small letter a with diaeresis + 'å' => "\xc3\xa5", #Latin small letter a with ring above = Latin small letter a ring + 'æ' => "\xc3\xa6", #Latin small letter ae = Latin small ligature ae 'ç' => "\xc3\xa7", #Latin small letter c with cedilla 'è' => "\xc3\xa8", #Latin small letter e with grave 'é' => "\xc3\xa9", #Latin small letter e with acute - 'ê' => "\xc3\xaa", #Latin small letter e with circumflex - 'ë' => "\xc3\xab", #Latin small letter e with diaeresis + 'ê' => "\xc3\xaa", #Latin small letter e with circumflex + 'ë' => "\xc3\xab", #Latin small letter e with diaeresis 'ì' => "\xc3\xac", #Latin small letter i with grave 'í' => "\xc3\xad", #Latin small letter i with acute - 'î' => "\xc3\xae", #Latin small letter i with circumflex - 'ï' => "\xc3\xaf", #Latin small letter i with diaeresis - 'ð' => "\xc3\xb0", #Latin small letter eth + 'î' => "\xc3\xae", #Latin small letter i with circumflex + 'ï' => "\xc3\xaf", #Latin small letter i with diaeresis + 'ð' => "\xc3\xb0", #Latin small letter eth 'ñ' => "\xc3\xb1", #Latin small letter n with tilde 'ò' => "\xc3\xb2", #Latin small letter o with grave 'ó' => "\xc3\xb3", #Latin small letter o with acute - 'ô' => "\xc3\xb4", #Latin small letter o with circumflex + 'ô' => "\xc3\xb4", #Latin small letter o with circumflex 'õ' => "\xc3\xb5", #Latin small letter o with tilde - 'ö' => "\xc3\xb6", #Latin small letter o with diaeresis + 'ö' => "\xc3\xb6", #Latin small letter o with diaeresis '÷' => "\xc3\xb7", #U+00F7 [÷] division sign 'ø' => "\xc3\xb8", #Latin small letter o with stroke = Latin small letter o slash 'ù' => "\xc3\xb9", #Latin small letter u with grave 'ú' => "\xc3\xba", #Latin small letter u with acute - 'û' => "\xc3\xbb", #Latin small letter u with circumflex - 'ü' => "\xc3\xbc", #Latin small letter u with diaeresis + 'û' => "\xc3\xbb", #Latin small letter u with circumflex + 'ü' => "\xc3\xbc", #Latin small letter u with diaeresis 'ý' => "\xc3\xbd", #Latin small letter y with acute - 'þ' => "\xc3\xbe", #Latin small letter thorn - 'ÿ' => "\xc3\xbf", #Latin small letter y with diaeresis + 'þ' => "\xc3\xbe", #Latin small letter thorn + 'ÿ' => "\xc3\xbf", #Latin small letter y with diaeresis #Symbols and Greek Letters: - 'ƒ' => "\xc6\x92", #U+0192 [ƒ] Latin small f with hook = function = florin - 'Α' => "\xce\x91", #Greek capital letter alpha - 'Β' => "\xce\x92", #Greek capital letter beta - 'Γ' => "\xce\x93", #Greek capital letter gamma - 'Δ' => "\xce\x94", #Greek capital letter delta + 'ƒ' => "\xc6\x92", #U+0192 [ƒ] Latin small f with hook = function = florin + 'Α' => "\xce\x91", #Greek capital letter alpha + 'Β' => "\xce\x92", #Greek capital letter beta + 'Γ' => "\xce\x93", #Greek capital letter gamma + 'Δ' => "\xce\x94", #Greek capital letter delta 'Ε' => "\xce\x95", #Greek capital letter epsilon - 'Ζ' => "\xce\x96", #Greek capital letter zeta - 'Η' => "\xce\x97", #Greek capital letter eta - 'Θ' => "\xce\x98", #Greek capital letter theta - 'Ι' => "\xce\x99", #Greek capital letter iota - 'Κ' => "\xce\x9a", #Greek capital letter kappa - 'Λ' => "\xce\x9b", #Greek capital letter lambda - 'Μ' => "\xce\x9c", #Greek capital letter mu - 'Ν' => "\xce\x9d", #Greek capital letter nu - 'Ξ' => "\xce\x9e", #Greek capital letter xi + 'Ζ' => "\xce\x96", #Greek capital letter zeta + 'Η' => "\xce\x97", #Greek capital letter eta + 'Θ' => "\xce\x98", #Greek capital letter theta + 'Ι' => "\xce\x99", #Greek capital letter iota + 'Κ' => "\xce\x9a", #Greek capital letter kappa + 'Λ' => "\xce\x9b", #Greek capital letter lambda + 'Μ' => "\xce\x9c", #Greek capital letter mu + 'Ν' => "\xce\x9d", #Greek capital letter nu + 'Ξ' => "\xce\x9e", #Greek capital letter xi 'Ο' => "\xce\x9f", #Greek capital letter omicron - 'Π' => "\xce\xa0", #Greek capital letter pi - 'Ρ' => "\xce\xa1", #Greek capital letter rho - 'Σ' => "\xce\xa3", #Greek capital letter sigma - 'Τ' => "\xce\xa4", #Greek capital letter tau + 'Π' => "\xce\xa0", #Greek capital letter pi + 'Ρ' => "\xce\xa1", #Greek capital letter rho + 'Σ' => "\xce\xa3", #Greek capital letter sigma + 'Τ' => "\xce\xa4", #Greek capital letter tau 'Υ' => "\xce\xa5", #Greek capital letter upsilon - 'Φ' => "\xce\xa6", #Greek capital letter phi - 'Χ' => "\xce\xa7", #Greek capital letter chi - 'Ψ' => "\xce\xa8", #Greek capital letter psi - 'Ω' => "\xce\xa9", #Greek capital letter omega - 'α' => "\xce\xb1", #Greek small letter alpha - 'β' => "\xce\xb2", #Greek small letter beta - 'γ' => "\xce\xb3", #Greek small letter gamma - 'δ' => "\xce\xb4", #Greek small letter delta + 'Φ' => "\xce\xa6", #Greek capital letter phi + 'Χ' => "\xce\xa7", #Greek capital letter chi + 'Ψ' => "\xce\xa8", #Greek capital letter psi + 'Ω' => "\xce\xa9", #Greek capital letter omega + 'α' => "\xce\xb1", #Greek small letter alpha + 'β' => "\xce\xb2", #Greek small letter beta + 'γ' => "\xce\xb3", #Greek small letter gamma + 'δ' => "\xce\xb4", #Greek small letter delta 'ε' => "\xce\xb5", #Greek small letter epsilon - 'ζ' => "\xce\xb6", #Greek small letter zeta - 'η' => "\xce\xb7", #Greek small letter eta - 'θ' => "\xce\xb8", #Greek small letter theta - 'ι' => "\xce\xb9", #Greek small letter iota - 'κ' => "\xce\xba", #Greek small letter kappa - 'λ' => "\xce\xbb", #Greek small letter lambda - 'μ' => "\xce\xbc", #Greek small letter mu - 'ν' => "\xce\xbd", #Greek small letter nu - 'ξ' => "\xce\xbe", #Greek small letter xi + 'ζ' => "\xce\xb6", #Greek small letter zeta + 'η' => "\xce\xb7", #Greek small letter eta + 'θ' => "\xce\xb8", #Greek small letter theta + 'ι' => "\xce\xb9", #Greek small letter iota + 'κ' => "\xce\xba", #Greek small letter kappa + 'λ' => "\xce\xbb", #Greek small letter lambda + 'μ' => "\xce\xbc", #Greek small letter mu + 'ν' => "\xce\xbd", #Greek small letter nu + 'ξ' => "\xce\xbe", #Greek small letter xi 'ο' => "\xce\xbf", #Greek small letter omicron - 'π' => "\xcf\x80", #Greek small letter pi - 'ρ' => "\xcf\x81", #Greek small letter rho - 'ς' => "\xcf\x82", #Greek small letter final sigma - 'σ' => "\xcf\x83", #Greek small letter sigma - 'τ' => "\xcf\x84", #Greek small letter tau + 'π' => "\xcf\x80", #Greek small letter pi + 'ρ' => "\xcf\x81", #Greek small letter rho + 'ς' => "\xcf\x82", #Greek small letter final sigma + 'σ' => "\xcf\x83", #Greek small letter sigma + 'τ' => "\xcf\x84", #Greek small letter tau 'υ' => "\xcf\x85", #Greek small letter upsilon - 'φ' => "\xcf\x86", #Greek small letter phi - 'χ' => "\xcf\x87", #Greek small letter chi - 'ψ' => "\xcf\x88", #Greek small letter psi - 'ω' => "\xcf\x89", #Greek small letter omega - 'ϑ'=> "\xcf\x91", #Greek small letter theta symbol - 'ϒ' => "\xcf\x92", #Greek upsilon with hook symbol - 'ϖ' => "\xcf\x96", #U+03D6 [ϖ] Greek pi symbol + 'φ' => "\xcf\x86", #Greek small letter phi + 'χ' => "\xcf\x87", #Greek small letter chi + 'ψ' => "\xcf\x88", #Greek small letter psi + 'ω' => "\xcf\x89", #Greek small letter omega + 'ϑ' => "\xcf\x91", #Greek small letter theta symbol + 'ϒ' => "\xcf\x92", #Greek upsilon with hook symbol + 'ϖ' => "\xcf\x96", #U+03D6 [ϖ] Greek pi symbol - '•' => "\xe2\x80\xa2", #U+2022 [•] bullet = black small circle - '…' => "\xe2\x80\xa6", #U+2026 […] horizontal ellipsis = three dot leader - '′' => "\xe2\x80\xb2", #U+2032 [′] prime = minutes = feet (для обозначения минут и футов) - '″' => "\xe2\x80\xb3", #U+2033 [″] double prime = seconds = inches (для обозначения секунд и дюймов). - '‾' => "\xe2\x80\xbe", #U+203E [‾] overline = spacing overscore - '⁄' => "\xe2\x81\x84", #U+2044 [⁄] fraction slash - '℘' => "\xe2\x84\x98", #U+2118 [℘] script capital P = power set = Weierstrass p - 'ℑ' => "\xe2\x84\x91", #U+2111 [ℑ] blackletter capital I = imaginary part - 'ℜ' => "\xe2\x84\x9c", #U+211C [ℜ] blackletter capital R = real part symbol - '™' => "\xe2\x84\xa2", #U+2122 [™] trade mark sign + '•' => "\xe2\x80\xa2", #U+2022 [•] bullet = black small circle + '…' => "\xe2\x80\xa6", #U+2026 […] horizontal ellipsis = three dot leader + '′' => "\xe2\x80\xb2", #U+2032 [′] prime = minutes = feet (для обозначения минут и футов) + '″' => "\xe2\x80\xb3", #U+2033 [″] double prime = seconds = inches (для обозначения секунд и дюймов). + '‾' => "\xe2\x80\xbe", #U+203E [‾] overline = spacing overscore + '⁄' => "\xe2\x81\x84", #U+2044 [⁄] fraction slash + '℘' => "\xe2\x84\x98", #U+2118 [℘] script capital P = power set = Weierstrass p + 'ℑ' => "\xe2\x84\x91", #U+2111 [ℑ] blackletter capital I = imaginary part + 'ℜ' => "\xe2\x84\x9c", #U+211C [ℜ] blackletter capital R = real part symbol + '™' => "\xe2\x84\xa2", #U+2122 [™] trade mark sign 'ℵ' => "\xe2\x84\xb5", #U+2135 [ℵ] alef symbol = first transfinite cardinal - '←' => "\xe2\x86\x90", #U+2190 [←] leftwards arrow - '↑' => "\xe2\x86\x91", #U+2191 [↑] upwards arrow - '→' => "\xe2\x86\x92", #U+2192 [→] rightwards arrow - '↓' => "\xe2\x86\x93", #U+2193 [↓] downwards arrow - '↔' => "\xe2\x86\x94", #U+2194 [↔] left right arrow - '↵' => "\xe2\x86\xb5", #U+21B5 [↵] downwards arrow with corner leftwards = carriage return - '⇐' => "\xe2\x87\x90", #U+21D0 [⇐] leftwards double arrow - '⇑' => "\xe2\x87\x91", #U+21D1 [⇑] upwards double arrow - '⇒' => "\xe2\x87\x92", #U+21D2 [⇒] rightwards double arrow - '⇓' => "\xe2\x87\x93", #U+21D3 [⇓] downwards double arrow - '⇔' => "\xe2\x87\x94", #U+21D4 [⇔] left right double arrow - '∀' => "\xe2\x88\x80", #U+2200 [∀] for all - '∂' => "\xe2\x88\x82", #U+2202 [∂] partial differential - '∃' => "\xe2\x88\x83", #U+2203 [∃] there exists - '∅' => "\xe2\x88\x85", #U+2205 [∅] empty set = null set = diameter - '∇' => "\xe2\x88\x87", #U+2207 [∇] nabla = backward difference - '∈' => "\xe2\x88\x88", #U+2208 [∈] element of - '∉' => "\xe2\x88\x89", #U+2209 [∉] not an element of - '∋' => "\xe2\x88\x8b", #U+220B [∋] contains as member - '∏' => "\xe2\x88\x8f", #U+220F [∏] n-ary product = product sign - '∑' => "\xe2\x88\x91", #U+2211 [∑] n-ary sumation - '−' => "\xe2\x88\x92", #U+2212 [−] minus sign - '∗' => "\xe2\x88\x97", #U+2217 [∗] asterisk operator - '√' => "\xe2\x88\x9a", #U+221A [√] square root = radical sign - '∝' => "\xe2\x88\x9d", #U+221D [∝] proportional to - '∞' => "\xe2\x88\x9e", #U+221E [∞] infinity - '∠' => "\xe2\x88\xa0", #U+2220 [∠] angle - '∧' => "\xe2\x88\xa7", #U+2227 [∧] logical and = wedge - '∨' => "\xe2\x88\xa8", #U+2228 [∨] logical or = vee - '∩' => "\xe2\x88\xa9", #U+2229 [∩] intersection = cap - '∪' => "\xe2\x88\xaa", #U+222A [∪] union = cup - '∫' => "\xe2\x88\xab", #U+222B [∫] integral - '∴' => "\xe2\x88\xb4", #U+2234 [∴] therefore - '∼' => "\xe2\x88\xbc", #U+223C [∼] tilde operator = varies with = similar to - '≅' => "\xe2\x89\x85", #U+2245 [≅] approximately equal to - '≈' => "\xe2\x89\x88", #U+2248 [≈] almost equal to = asymptotic to - '≠' => "\xe2\x89\xa0", #U+2260 [≠] not equal to - '≡' => "\xe2\x89\xa1", #U+2261 [≡] identical to - '≤' => "\xe2\x89\xa4", #U+2264 [≤] less-than or equal to - '≥' => "\xe2\x89\xa5", #U+2265 [≥] greater-than or equal to - '⊂' => "\xe2\x8a\x82", #U+2282 [⊂] subset of - '⊃' => "\xe2\x8a\x83", #U+2283 [⊃] superset of - '⊄' => "\xe2\x8a\x84", #U+2284 [⊄] not a subset of - '⊆' => "\xe2\x8a\x86", #U+2286 [⊆] subset of or equal to - '⊇' => "\xe2\x8a\x87", #U+2287 [⊇] superset of or equal to - '⊕' => "\xe2\x8a\x95", #U+2295 [⊕] circled plus = direct sum - '⊗' => "\xe2\x8a\x97", #U+2297 [⊗] circled times = vector product - '⊥' => "\xe2\x8a\xa5", #U+22A5 [⊥] up tack = orthogonal to = perpendicular - '⋅' => "\xe2\x8b\x85", #U+22C5 [⋅] dot operator - '⌈' => "\xe2\x8c\x88", #U+2308 [⌈] left ceiling = APL upstile - '⌉' => "\xe2\x8c\x89", #U+2309 [⌉] right ceiling - '⌊' => "\xe2\x8c\x8a", #U+230A [⌊] left floor = APL downstile - '⌋' => "\xe2\x8c\x8b", #U+230B [⌋] right floor - '⟨' => "\xe2\x8c\xa9", #U+2329 [〈] left-pointing angle bracket = bra - '⟩' => "\xe2\x8c\xaa", #U+232A [〉] right-pointing angle bracket = ket - '◊' => "\xe2\x97\x8a", #U+25CA [◊] lozenge - '♠' => "\xe2\x99\xa0", #U+2660 [♠] black spade suit - '♣' => "\xe2\x99\xa3", #U+2663 [♣] black club suit = shamrock - '♥' => "\xe2\x99\xa5", #U+2665 [♥] black heart suit = valentine - '♦' => "\xe2\x99\xa6", #U+2666 [♦] black diamond suit + '←' => "\xe2\x86\x90", #U+2190 [←] leftwards arrow + '↑' => "\xe2\x86\x91", #U+2191 [↑] upwards arrow + '→' => "\xe2\x86\x92", #U+2192 [→] rightwards arrow + '↓' => "\xe2\x86\x93", #U+2193 [↓] downwards arrow + '↔' => "\xe2\x86\x94", #U+2194 [↔] left right arrow + '↵' => "\xe2\x86\xb5", #U+21B5 [↵] downwards arrow with corner leftwards = carriage return + '⇐' => "\xe2\x87\x90", #U+21D0 [⇐] leftwards double arrow + '⇑' => "\xe2\x87\x91", #U+21D1 [⇑] upwards double arrow + '⇒' => "\xe2\x87\x92", #U+21D2 [⇒] rightwards double arrow + '⇓' => "\xe2\x87\x93", #U+21D3 [⇓] downwards double arrow + '⇔' => "\xe2\x87\x94", #U+21D4 [⇔] left right double arrow + '∀' => "\xe2\x88\x80", #U+2200 [∀] for all + '∂' => "\xe2\x88\x82", #U+2202 [∂] partial differential + '∃' => "\xe2\x88\x83", #U+2203 [∃] there exists + '∅' => "\xe2\x88\x85", #U+2205 [∅] empty set = null set = diameter + '∇' => "\xe2\x88\x87", #U+2207 [∇] nabla = backward difference + '∈' => "\xe2\x88\x88", #U+2208 [∈] element of + '∉' => "\xe2\x88\x89", #U+2209 [∉] not an element of + '∋' => "\xe2\x88\x8b", #U+220B [∋] contains as member + '∏' => "\xe2\x88\x8f", #U+220F [∏] n-ary product = product sign + '∑' => "\xe2\x88\x91", #U+2211 [∑] n-ary sumation + '−' => "\xe2\x88\x92", #U+2212 [−] minus sign + '∗' => "\xe2\x88\x97", #U+2217 [∗] asterisk operator + '√' => "\xe2\x88\x9a", #U+221A [√] square root = radical sign + '∝' => "\xe2\x88\x9d", #U+221D [∝] proportional to + '∞' => "\xe2\x88\x9e", #U+221E [∞] infinity + '∠' => "\xe2\x88\xa0", #U+2220 [∠] angle + '∧' => "\xe2\x88\xa7", #U+2227 [∧] logical and = wedge + '∨' => "\xe2\x88\xa8", #U+2228 [∨] logical or = vee + '∩' => "\xe2\x88\xa9", #U+2229 [∩] intersection = cap + '∪' => "\xe2\x88\xaa", #U+222A [∪] union = cup + '∫' => "\xe2\x88\xab", #U+222B [∫] integral + '∴' => "\xe2\x88\xb4", #U+2234 [∴] therefore + '∼' => "\xe2\x88\xbc", #U+223C [∼] tilde operator = varies with = similar to + '≅' => "\xe2\x89\x85", #U+2245 [≅] approximately equal to + '≈' => "\xe2\x89\x88", #U+2248 [≈] almost equal to = asymptotic to + '≠' => "\xe2\x89\xa0", #U+2260 [≠] not equal to + '≡' => "\xe2\x89\xa1", #U+2261 [≡] identical to + '≤' => "\xe2\x89\xa4", #U+2264 [≤] less-than or equal to + '≥' => "\xe2\x89\xa5", #U+2265 [≥] greater-than or equal to + '⊂' => "\xe2\x8a\x82", #U+2282 [⊂] subset of + '⊃' => "\xe2\x8a\x83", #U+2283 [⊃] superset of + '⊄' => "\xe2\x8a\x84", #U+2284 [⊄] not a subset of + '⊆' => "\xe2\x8a\x86", #U+2286 [⊆] subset of or equal to + '⊇' => "\xe2\x8a\x87", #U+2287 [⊇] superset of or equal to + '⊕' => "\xe2\x8a\x95", #U+2295 [⊕] circled plus = direct sum + '⊗' => "\xe2\x8a\x97", #U+2297 [⊗] circled times = vector product + '⊥' => "\xe2\x8a\xa5", #U+22A5 [⊥] up tack = orthogonal to = perpendicular + '⋅' => "\xe2\x8b\x85", #U+22C5 [⋅] dot operator + '⌈' => "\xe2\x8c\x88", #U+2308 [⌈] left ceiling = APL upstile + '⌉' => "\xe2\x8c\x89", #U+2309 [⌉] right ceiling + '⌊' => "\xe2\x8c\x8a", #U+230A [⌊] left floor = APL downstile + '⌋' => "\xe2\x8c\x8b", #U+230B [⌋] right floor + '⟨' => "\xe2\x8c\xa9", #U+2329 [〈] left-pointing angle bracket = bra + '⟩' => "\xe2\x8c\xaa", #U+232A [〉] right-pointing angle bracket = ket + '◊' => "\xe2\x97\x8a", #U+25CA [◊] lozenge + '♠' => "\xe2\x99\xa0", #U+2660 [♠] black spade suit + '♣' => "\xe2\x99\xa3", #U+2663 [♣] black club suit = shamrock + '♥' => "\xe2\x99\xa5", #U+2665 [♥] black heart suit = valentine + '♦' => "\xe2\x99\xa6", #U+2666 [♦] black diamond suit #Other Special Characters: - 'Œ' => "\xc5\x92", #U+0152 [Œ] Latin capital ligature OE - 'œ' => "\xc5\x93", #U+0153 [œ] Latin small ligature oe + 'Œ' => "\xc5\x92", #U+0152 [Œ] Latin capital ligature OE + 'œ' => "\xc5\x93", #U+0153 [œ] Latin small ligature oe 'Š' => "\xc5\xa0", #U+0160 [Š] Latin capital letter S with caron 'š' => "\xc5\xa1", #U+0161 [š] Latin small letter s with caron - 'Ÿ' => "\xc5\xb8", #U+0178 [Ÿ] Latin capital letter Y with diaeresis - 'ˆ' => "\xcb\x86", #U+02C6 [ˆ] modifier letter circumflex accent - '˜' => "\xcb\x9c", #U+02DC [˜] small tilde - ' ' => "\xe2\x80\x82", #U+2002 [ ] en space - ' ' => "\xe2\x80\x83", #U+2003 [ ] em space + 'Ÿ' => "\xc5\xb8", #U+0178 [Ÿ] Latin capital letter Y with diaeresis + 'ˆ' => "\xcb\x86", #U+02C6 [ˆ] modifier letter circumflex accent + '˜' => "\xcb\x9c", #U+02DC [˜] small tilde + ' ' => "\xe2\x80\x82", #U+2002 [ ] en space + ' ' => "\xe2\x80\x83", #U+2003 [ ] em space ' ' => "\xe2\x80\x89", #U+2009 [ ] thin space - '‌' => "\xe2\x80\x8c", #U+200C [‌] zero width non-joiner - '‍' => "\xe2\x80\x8d", #U+200D [‍] zero width joiner - '‎' => "\xe2\x80\x8e", #U+200E [‎] left-to-right mark - '‏' => "\xe2\x80\x8f", #U+200F [‏] right-to-left mark - '–' => "\xe2\x80\x93", #U+2013 [–] en dash - '—' => "\xe2\x80\x94", #U+2014 [—] em dash - '‘' => "\xe2\x80\x98", #U+2018 [‘] left single quotation mark - '’' => "\xe2\x80\x99", #U+2019 [’] right single quotation mark (and apostrophe!) - '‚' => "\xe2\x80\x9a", #U+201A [‚] single low-9 quotation mark - '“' => "\xe2\x80\x9c", #U+201C [“] left double quotation mark - '”' => "\xe2\x80\x9d", #U+201D [”] right double quotation mark - '„' => "\xe2\x80\x9e", #U+201E [„] double low-9 quotation mark + '‌' => "\xe2\x80\x8c", #U+200C [‌] zero width non-joiner + '‍' => "\xe2\x80\x8d", #U+200D [‍] zero width joiner + '‎' => "\xe2\x80\x8e", #U+200E [‎] left-to-right mark + '‏' => "\xe2\x80\x8f", #U+200F [‏] right-to-left mark + '–' => "\xe2\x80\x93", #U+2013 [–] en dash + '—' => "\xe2\x80\x94", #U+2014 [—] em dash + '‘' => "\xe2\x80\x98", #U+2018 [‘] left single quotation mark + '’' => "\xe2\x80\x99", #U+2019 [’] right single quotation mark (and apostrophe!) + '‚' => "\xe2\x80\x9a", #U+201A [‚] single low-9 quotation mark + '“' => "\xe2\x80\x9c", #U+201C [“] left double quotation mark + '”' => "\xe2\x80\x9d", #U+201D [”] right double quotation mark + '„' => "\xe2\x80\x9e", #U+201E [„] double low-9 quotation mark '†' => "\xe2\x80\xa0", #U+2020 [†] dagger '‡' => "\xe2\x80\xa1", #U+2021 [‡] double dagger '‰' => "\xe2\x80\xb0", #U+2030 [‰] per mille sign '‹' => "\xe2\x80\xb9", #U+2039 [‹] single left-pointing angle quotation mark '›' => "\xe2\x80\xba", #U+203A [›] single right-pointing angle quotation mark - '€' => "\xe2\x82\xac", #U+20AC [€] euro sign + '€' => "\xe2\x82\xac", #U+20AC [€] euro sign ); /** @@ -381,6 +506,7 @@ class UTF8 * * @link http://search.cpan.org/CPAN/authors/id/A/AM/AMICHAUER/Lingua-TT-Yanalif-0.08.tar.gz * @link http://www.unicode.org/charts/PDF/U0400.pdf + * @var array */ public static $cp1259_table = array( #bytes from 0x00 to 0x7F (ASCII) saved as is @@ -521,6 +647,7 @@ class UTF8 * lower case letter in UTF-8 * * @author Andreas Gohr + * @var array */ public static $convert_case_table = array( #CASE_UPPER => case_lower @@ -1181,8 +1308,12 @@ class UTF8 "\xef\xbc\xba" => "\xef\xbd\x9a", ); - #Unicode Character Database 6.0.0 (2010-06-04) - #autogenerated by unicode_blocks_txt2php() PHP function at 2011-06-04 00:19:39, 209 blocks total + /** + * Unicode Character Database 6.0.0 (2010-06-04) + * Autogenerated by unicode_blocks_txt2php() PHP function at 2011-06-04 00:19:39, 209 blocks total + * + * @var array + */ public static $unicode_blocks = array( 'Basic Latin' => array( 0 => 0x0000, @@ -2232,30 +2363,30 @@ class UTF8 ); #calling the methods of this class only statically! - private function __construct() {} + private function __construct() + { + } /** * Remove combining diactrical marks, with possibility of the restore * Удаляет диакритические знаки в тексте, с возможностью восстановления (опция) * - * @param string|null $s - * @param array|null $additional_chars for example: "\xc2\xad" #soft hyphen = discretionary hyphen - * @param bool $is_can_restored - * @param array|null &$restore_table + * @param string|null $s + * @param array|null $additional_chars for example: "\xc2\xad" #soft hyphen = discretionary hyphen + * @param bool $is_can_restored + * @param array|null &$restore_table * @return string|bool|null Returns FALSE if error occurred */ public static function diactrical_remove($s, $additional_chars = null, $is_can_restored = false, &$restore_table = null) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s) || $s === '') return $s; - if ($additional_chars) - { + if ($additional_chars) { foreach ($additional_chars as $k => &$v) $v = preg_quote($v, '/'); - $re = '/((?>' . self::$diactrical_re . '|' . implode('|', $additional_chars) . ')+)/sxSX'; - } - else $re = '/((?>' . self::$diactrical_re . ')+)/sxSX'; - if (! $is_can_restored) return preg_replace($re, '', $s); + $re = '/((?>' . self::DIACTRICAL_RE . '|' . implode('|', $additional_chars) . ')+)/sxSX'; + } else $re = '/((?>' . self::DIACTRICAL_RE . ')+)/sxSX'; + if (!$is_can_restored) return preg_replace($re, '', $s); $restore_table = array(); $a = preg_split($re, $s, -1, PREG_SPLIT_DELIM_CAPTURE); @@ -2263,8 +2394,7 @@ class UTF8 if ($c === 1) return $s; $pos = 0; $s2 = ''; - for ($i = 0; $i < $c - 1; $i += 2) - { + for ($i = 0; $i < $c - 1; $i += 2) { $s2 .= $a[$i]; #запоминаем символьные (не байтовые!) позиции $pos += self::strlen($a[$i]); @@ -2279,25 +2409,24 @@ class UTF8 * In Russian: * Восстанавливает диакритические знаки в тексте, при условии, что их символьные позиции и кол-во символов не изменились! * - * @see self::diactrical_remove() - * @param string|null $s - * @param array $restore_table + * @param string|null $s + * @param array $restore_table * @return string|bool|null Returns FALSE if error occurred (broken $restore_table) + * @see self::diactrical_remove() */ public static function diactrical_restore($s, array $restore_table) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s) || $s === '') return $s; - if (! $restore_table) return $s; - if (! is_int(@$restore_table['length']) || - ! is_array(@$restore_table['offsets']) || + if (!$restore_table) return $s; + if (!is_int(@$restore_table['length']) || + !is_array(@$restore_table['offsets']) || $restore_table['length'] !== self::strlen($s)) return false; $a = array(); $length = $offset = 0; $s2 = ''; - foreach ($restore_table['offsets'] as $pos => $diactricals) - { + foreach ($restore_table['offsets'] as $pos => $diactricals) { $length = $pos - $offset; $s2 .= self::substr($s, $offset, $length) . $diactricals; $offset = $pos; @@ -2308,26 +2437,28 @@ class UTF8 /** * Encodes data from another character encoding to UTF-8. * - * @param array|scalar|null $data - * @param string $charset + * @param array|scalar|null $data + * @param string $charset * @return array|scalar|null Returns FALSE if error occurred */ public static function convert_from($data, $charset = 'cp1251') { - if (! ReflectionTypeHint::isValid()) return false; + if (!ReflectionTypeHint::isValid()) return false; + $charset = strtoupper($charset); return self::_convert($data, $charset, 'UTF-8'); } /** * Encodes data from UTF-8 to another character encoding. * - * @param array|scalar|null $data - * @param string $charset + * @param array|scalar|null $data + * @param string $charset * @return array|scalar|null Returns FALSE if error occurred */ public static function convert_to($data, $charset = 'cp1251') { - if (! ReflectionTypeHint::isValid()) return false; + if (!ReflectionTypeHint::isValid()) return false; + $charset = strtoupper($charset); return self::_convert($data, 'UTF-8', $charset); } @@ -2335,32 +2466,31 @@ class UTF8 * Recoding the data of any structure to/from UTF-8. * Arrays traversed recursively, recoded keys and values. * - * @see mb_encoding_aliases() - * @param array|scalar|null $data - * @param string $charset_from - * @param string $charset_to + * @param array|scalar|null $data + * @param string $charset_from + * @param string $charset_to * @return array|scalar|null Returns FALSE if error occurred + * @see mb_encoding_aliases() */ private static function _convert($data, $charset_from, $charset_to) { - if (! ReflectionTypeHint::isValid()) return false; #for recursive calls - if ($charset_from === $charset_to) return $data; - if (is_array($data)) - { + if (!ReflectionTypeHint::isValid()) return false; #for recursive calls + if ($charset_from === $charset_to) return $data; #speed improve + if (is_array($data)) { $d = array(); - foreach ($data as $k => &$v) - { - $k = self::_convert($k, $charset_from, $charset_to); - if ($k === false) return false; + foreach ($data as $k => &$v) { + if (is_string($k)) { + $k = self::_convert($k, $charset_from, $charset_to); + if (!is_string($k)) return false; + } $d[$k] = self::_convert($v, $charset_from, $charset_to); - if ($d[$k] === false && ! is_bool($v)) return false; + if ($d[$k] === false && !is_bool($v)) return false; } return $d; } - if (is_string($data)) - { + if (is_string($data)) { #smart behaviour for errors protected + speed improve - if ($charset_from === 'UTF-8' && ! self::is_utf8($data)) return $data; + if ($charset_from === 'UTF-8' && !self::is_utf8($data)) return $data; if ($charset_to === 'UTF-8' && self::is_utf8($data)) return $data; #since PHP-5.3.x iconv() faster then mb_convert_encoding() @@ -2368,19 +2498,20 @@ class UTF8 if (function_exists('mb_convert_encoding')) return mb_convert_encoding($data, $charset_to, $charset_from); #charset_from - if ($charset_from === 'UTF-16' || $charset_from === 'UCS-2') return self::_convert_from_utf16($data); - if ($charset_from === 'cp1251' || $charset_from === 'cp1259') return strtr($data, self::$cp1259_table); - if ($charset_from === 'koi8-r' || $charset_from === 'KOI8-R') return strtr(convert_cyr_string($data, 'k', 'w'), self::$cp1259_table); - if ($charset_from === 'iso8859-5') return strtr(convert_cyr_string($data, 'i', 'w'), self::$cp1259_table); - if ($charset_from === 'cp866') return strtr(convert_cyr_string($data, 'a', 'w'), self::$cp1259_table); - if ($charset_from === 'mac-cyrillic') return strtr(convert_cyr_string($data, 'm', 'w'), self::$cp1259_table); + if ($charset_from === 'ISO-8859-1') return utf8_encode($data); + if ($charset_from === 'UTF-16' || $charset_from === 'UCS-2') return self::_convert_from_utf16($data); + if ($charset_from === 'CP1251' || $charset_from === 'CP1259') return strtr($data, self::$cp1259_table); + if ($charset_from === 'KOI8-R') return strtr(convert_cyr_string($data, 'k', 'w'), self::$cp1259_table); + if ($charset_from === 'ISO-8859-5') return strtr(convert_cyr_string($data, 'i', 'w'), self::$cp1259_table); + if ($charset_from === 'CP866') return strtr(convert_cyr_string($data, 'a', 'w'), self::$cp1259_table); + if ($charset_from === 'MAC-CYRILLIC') return strtr(convert_cyr_string($data, 'm', 'w'), self::$cp1259_table); #charset_to - if ($charset_to === 'cp1251' || $charset_to === 'cp1259') return strtr($data, array_flip(self::$cp1259_table)); + if ($charset_to === 'ISO-8859-1') return utf8_decode($data); + if ($charset_to === 'CP1251' || $charset_to === 'CP1259') return strtr($data, array_flip(self::$cp1259_table)); #last trying - if (function_exists('recode_string')) - { + if (function_exists('recode_string')) { $s = @recode_string($charset_from . '..' . $charset_to, $data); if (is_string($s)) return $s; } @@ -2388,7 +2519,8 @@ class UTF8 trigger_error('Convert "' . $charset_from . '" --> "' . $charset_to . '" is not supported native, "iconv" or "mbstring" extension required', E_USER_WARNING); return false; } - return $data; + if (is_scalar($data) || is_null($data)) return $data; #~ null, integer, float, boolean + return false; #object or resource } /** @@ -2399,10 +2531,10 @@ class UTF8 * Преобразует строку из кодировки UTF-16 / UCS-2 в UTF-8. * Суррогаты UTF-16 поддерживаются! * - * @param string $s - * @param string $type 'BE' -- big endian byte order + * @param string $s + * @param string $type 'BE' -- big endian byte order * 'LE' -- little endian byte order - * @param bool $to_array returns array chars instead whole string? + * @param bool $to_array returns array chars instead whole string? * @return string|array|bool UTF-8 string, array chars or FALSE if error occurred */ private static function _convert_from_utf16($s, $type = 'BE', $to_array = false) @@ -2411,67 +2543,61 @@ class UTF8 'BE' => 'n', #unsigned short (always 16 bit, big endian byte order) 'LE' => 'v', #unsigned short (always 16 bit, little endian byte order) ); - if (! array_key_exists($type, $types)) - { + if (!array_key_exists($type, $types)) { trigger_error('Unexpected value in 2-nd parameter, "' . $type . '" given!', E_USER_WARNING); return false; } #the fastest way: - if (function_exists('iconv') || function_exists('mb_convert_encoding')) - { - if (function_exists('iconv')) $s = iconv('UTF-16' . $type, 'UTF-8', $s); + if (function_exists('iconv') || function_exists('mb_convert_encoding')) { + if (function_exists('iconv')) $s = iconv('UTF-16' . $type, 'UTF-8', $s); elseif (function_exists('mb_convert_encoding')) $s = mb_convert_encoding($s, 'UTF-8', 'UTF-16' . $type); - if (! $to_array) return $s; + if (!$to_array) return $s; return self::str_split($s); } /* - http://en.wikipedia.org/wiki/UTF-16 + http://en.wikipedia.org/wiki/UTF-16 - The improvement that UTF-16 made over UCS-2 is its ability to encode - characters in planes 1-16, not just those in plane 0 (BMP). + The improvement that UTF-16 made over UCS-2 is its ability to encode + characters in planes 1-16, not just those in plane 0 (BMP). - UTF-16 represents non-BMP characters (those from U+10000 through U+10FFFF) - using a pair of 16-bit words, known as a surrogate pair. - First 1000016 is subtracted from the code point to give a 20-bit value. - This is then split into two separate 10-bit values each of which is represented - as a surrogate with the most significant half placed in the first surrogate. - To allow safe use of simple word-oriented string processing, separate ranges - of values are used for the two surrogates: 0xD800-0xDBFF for the first, most - significant surrogate and 0xDC00-0xDFFF for the second, least significant surrogate. + UTF-16 represents non-BMP characters (those from U+10000 through U+10FFFF) + using a pair of 16-bit words, known as a surrogate pair. + First 1000016 is subtracted from the code point to give a 20-bit value. + This is then split into two separate 10-bit values each of which is represented + as a surrogate with the most significant half placed in the first surrogate. + To allow safe use of simple word-oriented string processing, separate ranges + of values are used for the two surrogates: 0xD800-0xDBFF for the first, most + significant surrogate and 0xDC00-0xDFFF for the second, least significant surrogate. - For example, the character at code point U+10000 becomes the code unit sequence 0xD800 0xDC00, - and the character at U+10FFFD, the upper limit of Unicode, becomes the sequence 0xDBFF 0xDFFD. - Unicode and ISO/IEC 10646 do not, and will never, assign characters to any of the code points - in the U+D800-U+DFFF range, so an individual code value from a surrogate pair does not ever - represent a character. + For example, the character at code point U+10000 becomes the code unit sequence 0xD800 0xDC00, + and the character at U+10FFFD, the upper limit of Unicode, becomes the sequence 0xDBFF 0xDFFD. + Unicode and ISO/IEC 10646 do not, and will never, assign characters to any of the code points + in the U+D800-U+DFFF range, so an individual code value from a surrogate pair does not ever + represent a character. - http://www.russellcottrell.com/greek/utilities/SurrogatePairCalculator.htm - http://www.russellcottrell.com/greek/utilities/UnicodeRanges.htm + http://www.russellcottrell.com/greek/utilities/SurrogatePairCalculator.htm + http://www.russellcottrell.com/greek/utilities/UnicodeRanges.htm - Conversion of a Unicode scalar value S to a surrogate pair : - H = Math.floor((S - 0x10000) / 0x400) + 0xD800; - L = ((S - 0x10000) % 0x400) + 0xDC00; - The conversion of a surrogate pair to a scalar value: - N = ((H - 0xD800) * 0x400) + (L - 0xDC00) + 0x10000; + Conversion of a Unicode scalar value S to a surrogate pair : + H = Math.floor((S - 0x10000) / 0x400) + 0xD800; + L = ((S - 0x10000) % 0x400) + 0xDC00; + The conversion of a surrogate pair to a scalar value: + N = ((H - 0xD800) * 0x400) + (L - 0xDC00) + 0x10000; */ $a = array(); $hi = false; - foreach (unpack($types[$type] . '*', $s) as $codepoint) - { + foreach (unpack($types[$type] . '*', $s) as $codepoint) { #surrogate process - if ($hi !== false) - { + if ($hi !== false) { $lo = $codepoint; if ($lo < 0xDC00 || $lo > 0xDFFF) $a[] = "\xEF\xBF\xBD"; #U+FFFD REPLACEMENT CHARACTER (for broken char) - else - { + else { $codepoint = (($hi - 0xD800) * 0x400) + ($lo - 0xDC00) + 0x10000; $a[] = self::chr($codepoint); } $hi = false; - } - elseif ($codepoint < 0xD800 || $codepoint > 0xDBFF) $a[] = self::chr($codepoint); #not surrogate + } elseif ($codepoint < 0xD800 || $codepoint > 0xDBFF) $a[] = self::chr($codepoint); #not surrogate else $hi = $codepoint; #surrogate was found } return $to_array ? $a : implode('', $a); @@ -2480,42 +2606,67 @@ class UTF8 /** * Strips out device control codes in the ASCII range. * - * @param string|null String to clean - * @return string|bool|null Returns FALSE if error occurred + * @param array|scalar|null Data to clean + * @return array|scalar|null Returns FALSE if error occurred */ - public static function strict($s) + public static function strict($data) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; - return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F]+/sSX', '', $s); + if (!ReflectionTypeHint::isValid()) return false; + if (is_array($data)) { + $d = array(); + foreach ($data as $k => &$v) { + if (is_string($k)) { + $k = self::strict($k); + if (!is_string($k)) return false; + } + $d[$k] = self::strict($v); + if ($d[$k] === false && !is_bool($v)) return false; + } + return $d; + } + if (is_string($data)) return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F]+/sSX', '', $data); + if (is_scalar($data) || is_null($data)) return $data; #int/float/bool/null + return false; #object or resource + } + + /** + * Check the data accessory to the class of control characters in ASCII. + * For non string always returns FALSE. + * + * @param scalar|null $data + * @param int|null $found_char_offset Returns the offset for the first found binary symbol + * @return bool + */ + public static function has_binary($data, &$found_char_offset = null) + { + if (!ReflectionTypeHint::isValid()) return false; + #[\t\n\r] = [\x09\x0a\x0d] + #[\x00-\x1f\x7f](? &$v) - { - if (! self::is_ascii($k) || ! self::is_ascii($v)) return false; - } - return true; + if (!ReflectionTypeHint::isValid()) return false; + if (is_string($data)) { + if (!preg_match('~[\x80-\xff]~sSX', $data, $m, PREG_OFFSET_CAPTURE)) return true; + $error_char_offset = $m[0][1]; + return false; } - #ltrim() little faster then preg_match() - #if (is_string($data)) return preg_match('/^[\x00-\x7f]*$/sSX', $data); #deprecated - if (is_string($data)) return ltrim($data, "\x00..\x7f") === ''; - if (is_scalar($data) || is_null($data)) return true; #~ null, integer, float, boolean - return false; #object or resource + if (is_int($data) || is_float($data)) return true; + return false; } /** @@ -2529,48 +2680,55 @@ class UTF8 * @link http://ru3.php.net/mb_detect_encoding * @link http://webtest.philigon.ru/articles/utf8/ * @link http://unicode.coeurlumiere.com/ - * @param array|scalar|null $data - * @param bool $is_strict strict the range of ASCII? + * @param array|scalar|null $data + * @param bool $is_strict strict the range of ASCII? * @return bool */ public static function is_utf8($data, $is_strict = true) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_array($data)) - { - foreach ($data as $k => &$v) - { - if (! self::is_utf8($k, $is_strict) || ! self::is_utf8($v, $is_strict)) return false; + if (!ReflectionTypeHint::isValid()) return false; + if (is_string($data)) { + if (preg_match('~~suSX', $data) !== 1) return false; + //if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) return false; + //preg_match('~~suSX') much faster (up to 4 times), then mb_check_encoding($data, 'UTF-8')! + //if (function_exists('mb_check_encoding') && ! mb_check_encoding($data, 'UTF-8')) return false; #DEPRECATED + /** + * Специальные символы по спецификации JSON (http://json.org/) + * \b represents the backspace character (U+0008) + * \t represents the character tabulation character (U+0009) + * \n represents the line feed character (U+000A) + * \f represents the form feed character (U+000C) + * \r represents the carriage return character (U+000D) + */ + //с данным регулярным выражением preg_match() работает в 2 раза быстрее, чем strpbrk() + if ($is_strict && preg_match('/[^\x08\x09\x0A\x0C\x0D\x20-\xBF\xC2-\xF7]/sSX', $data)) { + return false; } return true; } - if (is_string($data)) - { - if (! preg_match('~~suSX', $data)) return false; - if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) return false; - #preg_match('~~suSX') much faster (up to 4 times), then mb_check_encoding($data, 'UTF-8')! - #if (function_exists('mb_check_encoding') && ! mb_check_encoding($data, 'UTF-8')) return false; #DEPRECATED - if ($is_strict && preg_match('/[^\x09\x0A\x0D\x20-\xBF\xC2-\xF7]/sSX', $data)) return false; + if (is_scalar($data) || is_null($data)) return true; #int/float/bool/null + if (is_array($data)) { + foreach ($data as $k => &$v) { + if (!self::is_utf8($k, $is_strict) || !self::is_utf8($v, $is_strict)) return false; + } return true; } - if (is_scalar($data) || is_null($data)) return true; #~ null, integer, float, boolean return false; #object or resource } /** * Tries to detect if a string is in Unicode encoding * - * @deprecated Slowly, use self::is_utf8() instead - * @see self::is_utf8() - * @param string $s текст - * @param bool $is_strict строгая проверка диапазона ASCII? + * @param string $s текст + * @param bool $is_strict строгая проверка диапазона ASCII? * @return bool + * @see self::is_utf8() + * @deprecated Slowly, use self::is_utf8() instead */ public static function check($s, $is_strict = true) { - if (! ReflectionTypeHint::isValid()) return false; - for ($i = 0, $len = strlen($s); $i < $len; $i++) - { + if (!ReflectionTypeHint::isValid()) return false; + for ($i = 0, $len = strlen($s); $i < $len; $i++) { $c = ord($s[$i]); if ($c < 0x80) #1 byte 0bbbbbbb { @@ -2583,10 +2741,9 @@ class UTF8 elseif (($c & 0xFE) == 0xFC) $n = 5; #6 bytes 1111110b 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb else return false; #does not match any model #n bytes matching 10bbbbbb follow ? - for ($j = 0; $j < $n; $j++) - { + for ($j = 0; $j < $n; $j++) { $i++; - if ($i == $len || ((ord($s[$i]) & 0xC0) != 0x80) ) return false; + if ($i == $len || ((ord($s[$i]) & 0xC0) != 0x80)) return false; } } return true; @@ -2601,6 +2758,11 @@ class UTF8 * Arrays traversed recursively (keys and values). * At least if one array element value is not passed checking, it returns FALSE. * + * @param array|scalar|null $data + * @param array|string $blocks + * @return bool Возвращает TRUE, если все символы из текста принадлежат указанным диапазонам + * и FALSE в противном случае или для разбитого UTF-8. + * @link http://www.unicode.org/charts/ * @example * #A simple check the standard named ranges: * UTF8::blocks_check('поисковые системы Google и Yandex', array('Basic Latin', 'Cyrillic')); @@ -2612,167 +2774,59 @@ class UTF8 * 'Arrows', * )); * - * @link http://www.unicode.org/charts/ - * @param array|scalar|null $data - * @param array|string $blocks - * @return bool Возвращает TRUE, если все символы из текста принадлежат указанным диапазонам - * и FALSE в противном случае или для разбитого UTF-8. */ public static function blocks_check($data, $blocks) { - if (! ReflectionTypeHint::isValid()) return false; + if (!ReflectionTypeHint::isValid()) return false; - if (is_array($data)) - { - foreach ($data as $k => &$v) - { - if (! self::blocks_check($k, $blocks) || ! self::blocks_check($v, $blocks)) return false; + if (is_array($data)) { + foreach ($data as $k => &$v) { + if (!self::blocks_check($k, $blocks) || !self::blocks_check($v, $blocks)) return false; } return true; } - if (is_string($data)) - { - $chars = self::str_split($data); - if ($chars === false) return false; #broken UTF-8 - unset($data); #memory free - $skip = array(); #save to cache already checked symbols - foreach ($chars as $i => $char) - { - if (array_key_exists($char, $skip)) continue; #speed improve - $codepoint = self::ord($char); - if ($codepoint === false) return false; #broken UTF-8 - $is_valid = false; - $blocks = (array)$blocks; - foreach ($blocks as $j => $block) - { - if (is_string($block)) - { - if (! array_key_exists($block, self::$unicode_blocks)) - { - trigger_error('Unknown block "' . $block . '"!', E_USER_WARNING); - return false; - } - list ($min, $max) = self::$unicode_blocks[$block]; - } - elseif (is_array($block)) list ($min, $max) = $block; - elseif (is_int($block)) $min = $max = $block; - else trigger_error('A string/array/int type expected for block[' . $j . ']!', E_USER_ERROR); - if ($codepoint >= $min && $codepoint <= $max) - { - $is_valid = true; - break; - } - }#foreach - if (! $is_valid) return false; - $skip[$char] = null; - }#foreach - return true; - } - if (is_scalar($data) || is_null($data)) return true; #~ null, integer, float, boolean - return false; #object or resource - } + if (is_int($data)) $data = strval($data); + elseif (is_float($data)) $data = str_replace(',', '.', strval($data)); + elseif (!is_string($data)) return false; - /** - * Recode $_GET, $_POST, $_COOKIE, $_REQUEST, $_FILES from $charset encoding to UTF-8, if necessary. - * A side effect is a positive protection against XSS attacks with non-printable characters on the vulnerable PHP function. - * Thus web forms can be sent to the server in 2-encoding: $charset and UTF-8. - * For example: ?тест[тест]=тест - * - * Алгоритм работы: - * 1) Функция проверяет массивы $_GET, $_POST, $_COOKIE, $_REQUEST, $_FILES - * на корректность значений элементов кодировке UTF-8. - * 2) Значения не в UTF-8 принимаются как $charset и конвертируется в UTF-8, - * при этом байты от 0x00 до 0x7F (ASCII) сохраняются как есть. - * 3) Сконвертированные значения снова проверяются. - * Если данные опять не в кодировке UTF-8, то они считаются разбитыми и функция возвращает FALSE. - * - * NOTICE - * Функция должна вызываться после self::unescape_request()! - * - * @see self::unescape_request() - * @param bool $is_hex2bin Декодировать HEX-данные? - * Пример: 0xd09ec2a0d0bad0bed0bcd0bfd0b0d0bdd0b8d0b8 => О компании - * Параметры в URL адресах иногда бывает удобно кодировать не функцией rawurlencode(), - * а использовать следующий механизм (к тому же кодирующий данные более компактно): - * '0x' . bin2hex($string) - * @param string $charset - * @return bool Возвращает TRUE, если все значения элементов массивов в кодировке UTF-8 - * и FALSE + E_USER_WARNING в противном случае. - */ - public static function autoconvert_request($is_hex2bin = false, $charset = 'cp1251') - { - if (! ReflectionTypeHint::isValid()) return false; - $is_converted = false; - $is_broken = false; - foreach (array('_GET', '_POST', '_COOKIE', '_FILES') as $k => $v) - { - if (! array_key_exists($v, $GLOBALS)) continue; - #использовать array_walk_recursive() не предоставляется возможным, - #т.к. его callback функция не поддерживает передачу ключа по ссылке - $GLOBALS[$v] = self::_autoconvert_request_recursive($GLOBALS[$v], $is_converted, $is_broken, $is_hex2bin, $charset); - if ($is_broken) - { - trigger_error('Array $' . $v . ' does not have keys/values in UTF-8 charset!', E_USER_WARNING); - return false; + $chars = self::str_split($data); + if ($chars === false) return false; #broken UTF-8 + unset($data); #memory free + $skip = array(); #save to cache already checked symbols + foreach ($chars as $i => $char) { + if (array_key_exists($char, $skip)) continue; #speed improve + $codepoint = self::ord($char); + if (!is_int($codepoint)) return false; #broken UTF-8? + $is_valid = false; + $blocks = (array)$blocks; + foreach ($blocks as $j => $block) { + if (is_string($block)) { + if (!array_key_exists($block, self::$unicode_blocks)) { + trigger_error('Unknown block "' . $block . '"!', E_USER_WARNING); + return false; + } + list ($min, $max) = self::$unicode_blocks[$block]; + } elseif (is_array($block)) list ($min, $max) = $block; + elseif (is_int($block)) $min = $max = $block; + else trigger_error('A string/array/int type expected for block[' . $j . ']!', E_USER_ERROR); + if ($codepoint >= $min && $codepoint <= $max) { + $is_valid = true; + break; + } } - } - if ($is_converted) - { - $_REQUEST = - (isset($_COOKIE) ? $_COOKIE : array()) + - (isset($_POST) ? $_POST : array()) + - (isset($_GET) ? $_GET : array()); + if (!$is_valid) return false; + $skip[$char] = null; } return true; } - private static function _autoconvert_request_recursive(&$data, &$is_converted, &$is_broken, $is_hex2bin, $charset) - { - if ($is_broken) return $data; #speed improve - if (is_array($data)) - { - $d = array(); - foreach ($data as $k => &$v) - { - $k = self::_autoconvert_request($k, $is_converted, $is_broken, $is_hex2bin, $charset); - if ($is_broken) return $data; #speed improve - $d[$k] = self::_autoconvert_request_recursive($v, $is_converted, $is_broken, $is_hex2bin, $charset); - if ($is_broken) return $data; #speed improve - } - return $d; - } - return self::_autoconvert_request($data, $is_converted, $is_broken, $is_hex2bin, $charset); - } - - private static function _autoconvert_request(&$s, &$is_converted, &$is_broken, $is_hex2bin, $charset) - { - #regexp speed improve by using strpos() - if ($is_hex2bin && strpos($s, '0x') === 0 && preg_match('/^0x((?:[\da-fA-F]{2})+)$/sSX', $s, $m)) - { - $s = pack('H' . strlen($m[1]), $m[1]); #hex2bin() - $is_converted = true; - } - if (! self::is_utf8($s)) - { - $s = self::convert_from($s, $charset); - if ($s === false) $is_broken = true; - elseif (! self::is_utf8($s)) - { - trigger_error('String 0x ' . substr(bin2hex($s), 0, 100) . '... is not UTF-8!', E_USER_WARNING); - $is_broken = true; - } - else $is_converted = true; - } - return $s; - } - /** * Сравнение строк * - * @param string|null $s1 - * @param string|null $s2 - * @param string $locale For example, 'en_CA', 'ru_RU' + * @param string|null $s1 + * @param string|null $s2 + * @param string $locale For example, 'en_CA', 'ru_RU' * @return int|bool|null Returns FALSE if error occurred * Returns < 0 if $s1 is less than $s2; * > 0 if $s1 is greater than $s2; @@ -2780,14 +2834,13 @@ class UTF8 */ public static function strcmp($s1, $s2, $locale = '') { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s1) || is_null($s2)) return null; - if (! function_exists('collator_create')) return strcmp($s1, $s2); + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s1) || !is_string($s2)) return null; + if (!function_exists('collator_create')) return strcmp($s1, $s2); # PHP 5 >= 5.3.0, PECL intl >= 1.0.0 # If empty string ("") or "root" are passed, UCA rules will be used. $c = new Collator($locale); - if (! $c) - { + if (!$c) { # Returns an "empty" object on error. You can use intl_get_error_code() and/or intl_get_error_message() to know what happened. trigger_error(intl_get_error_message(), E_USER_WARNING); return false; @@ -2798,9 +2851,9 @@ class UTF8 /** * Сравнение строк для N первых символов * - * @param string|null $s1 - * @param string|null $s2 - * @param int $length + * @param string|null $s1 + * @param string|null $s2 + * @param int $length * @return int|bool|null Returns FALSE if error occurred * Returns < 0 if $s1 is less than $s2; * > 0 if $s1 is greater than $s2; @@ -2808,16 +2861,16 @@ class UTF8 */ public static function strncmp($s1, $s2, $length) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s1) || is_null($s2)) return null; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s1) || !is_string($s2)) return null; return self::strcmp(self::substr($s1, 0, $length), self::substr($s2, 0, $length)); } /** * Implementation strcasecmp() function for UTF-8 encoding string. * - * @param string|null $s1 - * @param string|null $s2 + * @param string|null $s1 + * @param string|null $s2 * @return int|bool|null Returns FALSE if error occurred * Returns < 0 if $s1 is less than $s2; * > 0 if $s1 is greater than $s2; @@ -2825,22 +2878,22 @@ class UTF8 */ public static function strcasecmp($s1, $s2) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s1) || is_null($s2)) return null; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s1) || !is_string($s2)) return null; return self::strcmp(self::lowercase($s1), self::lowercase($s2)); } /** * Converts a UTF-8 string to a UNICODE codepoints * - * @param string|null $s UTF-8 string + * @param string|null $s UTF-8 string * @return array|bool|null Unicode codepoints * Returns FALSE if $s broken (not UTF-8) */ public static function to_unicode($s) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s) || $s === '') return $s; $s2 = null; #since PHP-5.3.x iconv() little faster then mb_convert_encoding() @@ -2850,35 +2903,37 @@ class UTF8 if ($s2 !== null) return false; $a = self::str_split($s); - if ($a === false) return false; + if (!is_array($a)) return false; return array_map(array(__CLASS__, 'ord'), $a); } /** * Converts a UNICODE codepoints to a UTF-8 string * - * @param array|null $a Unicode codepoints + * @param array|null $a Unicode codepoints * @return string|bool|null UTF-8 string * Returns FALSE if error occurred */ public static function from_unicode($a) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($a)) return $a; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_array($a)) return $a; #since PHP-5.3.x iconv() little faster then mb_convert_encoding() - if (function_exists('iconv')) - { - array_walk($a, function(&$cp) { $cp = pack('N', $cp); }); + if (function_exists('iconv')) { + array_walk($a, function (&$cp) { + $cp = pack('N', $cp); + }); $s = @iconv('UCS-4BE', 'UTF-8', implode('', $a)); - if (! is_string($s)) return false; + if (!is_string($s)) return false; return $s; } - if (function_exists('mb_convert_encoding')) - { - array_walk($a, function(&$cp) { $cp = pack('N', $cp); }); + if (function_exists('mb_convert_encoding')) { + array_walk($a, function (&$cp) { + $cp = pack('N', $cp); + }); $s = mb_convert_encoding(implode('', $a), 'UTF-8', 'UCS-4BE'); - if (! is_string($s)) return false; + if (!is_string($s)) return false; return $s; } @@ -2888,30 +2943,33 @@ class UTF8 /** * Converts a UTF-8 character to a UNICODE codepoint * - * @param string|null $char UTF-8 character + * @param string|null $char UTF-8 character * @return int|bool|null Unicode codepoint * Returns FALSE if $char broken (not UTF-8) */ public static function ord($char) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($char)) return $char; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($char)) return $char; static $cache = array(); if (array_key_exists($char, $cache)) return $cache[$char]; #speed improve - switch (strlen($char)) - { - case 1 : return $cache[$char] = ord($char); - case 2 : return $cache[$char] = (ord($char{1}) & 63) | - ((ord($char{0}) & 31) << 6); - case 3 : return $cache[$char] = (ord($char{2}) & 63) | - ((ord($char{1}) & 63) << 6) | - ((ord($char{0}) & 15) << 12); - case 4 : return $cache[$char] = (ord($char{3}) & 63) | - ((ord($char{2}) & 63) << 6) | - ((ord($char{1}) & 63) << 12) | - ((ord($char{0}) & 7) << 18); + switch (strlen($char)) { + case 1 : + return $cache[$char] = ord($char); + case 2 : + return $cache[$char] = (ord($char{1}) & 63) | + ((ord($char{0}) & 31) << 6); + case 3 : + return $cache[$char] = (ord($char{2}) & 63) | + ((ord($char{1}) & 63) << 6) | + ((ord($char{0}) & 15) << 12); + case 4 : + return $cache[$char] = (ord($char{3}) & 63) | + ((ord($char{2}) & 63) << 6) | + ((ord($char{1}) & 63) << 12) | + ((ord($char{0}) & 7) << 18); default : trigger_error('Character 0x' . bin2hex($char) . ' is not UTF-8!', E_USER_WARNING); return false; @@ -2921,28 +2979,28 @@ class UTF8 /** * Converts a UNICODE codepoint to a UTF-8 character * - * @param int|digit|null $cp Unicode codepoint + * @param int|digit|null $cp Unicode codepoint * @return string|bool|null UTF-8 character * Returns FALSE if error occurred */ public static function chr($cp) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($cp)) return $cp; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_int($cp) && !ctype_digit($cp)) return $cp; static $cache = array(); if (array_key_exists($cp, $cache)) return $cache[$cp]; #speed improve - if ($cp <= 0x7f) return $cache[$cp] = chr($cp); - if ($cp <= 0x7ff) return $cache[$cp] = chr(0xc0 | ($cp >> 6)) . - chr(0x80 | ($cp & 0x3f)); - if ($cp <= 0xffff) return $cache[$cp] = chr(0xe0 | ($cp >> 12)) . - chr(0x80 | (($cp >> 6) & 0x3f)) . - chr(0x80 | ($cp & 0x3f)); + if ($cp <= 0x7f) return $cache[$cp] = chr($cp); + if ($cp <= 0x7ff) return $cache[$cp] = chr(0xc0 | ($cp >> 6)) . + chr(0x80 | ($cp & 0x3f)); + if ($cp <= 0xffff) return $cache[$cp] = chr(0xe0 | ($cp >> 12)) . + chr(0x80 | (($cp >> 6) & 0x3f)) . + chr(0x80 | ($cp & 0x3f)); if ($cp <= 0x10ffff) return $cache[$cp] = chr(0xf0 | ($cp >> 18)) . - chr(0x80 | (($cp >> 12) & 0x3f)) . - chr(0x80 | (($cp >> 6) & 0x3f)) . - chr(0x80 | ($cp & 0x3f)); + chr(0x80 | (($cp >> 12) & 0x3f)) . + chr(0x80 | (($cp >> 6) & 0x3f)) . + chr(0x80 | ($cp & 0x3f)); #U+FFFD REPLACEMENT CHARACTER return $cache[$cp] = "\xEF\xBF\xBD"; } @@ -2950,43 +3008,49 @@ class UTF8 /** * Implementation chunk_split() function for UTF-8 encoding string. * - * @param string|null $s - * @param int|digit|null $length - * @param string|null $glue + * @param string|null $s + * @param int|digit|null $length + * @param string|null $glue * @return string|bool|null Returns FALSE if error occurred */ public static function chunk_split($s, $length = null, $glue = null) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s) || $s === '') return $s; $length = intval($length); - $glue = strval($glue); + $glue = strval($glue); if ($length < 1) $length = 76; if ($glue === '') $glue = "\r\n"; - if (! is_array($a = self::str_split($s, $length))) return false; + $a = self::str_split($s, $length); + if (!is_array($a)) return false; return implode($glue, $a); } /** * Changes all keys in an array * - * @param array|null $a - * @param int $mode {CASE_LOWER|CASE_UPPER} + * @param array|null $a + * @param int $mode {CASE_LOWER|CASE_UPPER} + * @param bool $is_recursive * @return array|bool|null Returns FALSE if error occurred */ - public static function array_change_key_case($a, $mode) + public static function array_change_key_case($a, $mode, $is_recursive = false) { - if (! ReflectionTypeHint::isValid()) return false; - if (! is_array($a)) return $a; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_array($a)) return $a; + $a2 = array(); - foreach ($a as $k => $v) - { - if (is_string($k)) - { + foreach ($a as $k => $v) { + if (is_string($k)) { $k = self::convert_case($k, $mode); if ($k === false) return false; } + if ($is_recursive && is_array($v)) #recursive support + { + $v = self::array_change_key_case($v, $mode, $is_recursive); + if (!is_array($v)) return false; + } $a2[$k] = $v; } return $a2; @@ -2998,34 +3062,35 @@ class UTF8 * в элементах массива, а ключи остаются без изменений. * Для конвертирования только ключей используйте метод self::array_change_key_case(). * - * @see self::array_change_key_case() + * @param array|scalar|null $data Данные произвольной структуры + * @param int $mode {CASE_LOWER|CASE_UPPER} + * @param bool $is_ascii_optimization for speed improve + * @return scalar|bool|null Returns FALSE if error occurred * @link http://www.unicode.org/charts/PDF/U0400.pdf * @link http://ru.wikipedia.org/wiki/ISO_639-1 - * @param array|scalar|null $data Данные произвольной структуры - * @param int $mode {CASE_LOWER|CASE_UPPER} - * @param bool $is_ascii_optimization for speed improve - * @return scalar|bool|null Returns FALSE if error occurred + * @see self::array_change_key_case() */ public static function convert_case($data, $mode, $is_ascii_optimization = true) { - if (! ReflectionTypeHint::isValid()) return false; + if (!ReflectionTypeHint::isValid()) return false; - if (is_array($data)) + if (is_array($data)) #recursive support { - foreach ($data as $k => &$v) $v = self::convert_case($v, $mode); + foreach ($data as $k => $v) { + $data[$k] = self::convert_case($v, $mode); + if ($data[$k] === false && !is_bool($v)) return false; + } return $data; } - if (! is_string($data) || ! $data) return $data; + if (!is_string($data) || !$data) return $data; - if ($mode === CASE_UPPER) - { + if ($mode === CASE_UPPER) { if ($is_ascii_optimization && self::is_ascii($data)) return strtoupper($data); #speed improve! #deprecated, since PHP-5.3.x strtr() 2-3 times faster then mb_strtolower() #if (function_exists('mb_strtoupper')) return mb_strtoupper($data, 'utf-8'); return strtr($data, array_flip(self::$convert_case_table)); } - if ($mode === CASE_LOWER) - { + if ($mode === CASE_LOWER) { if ($is_ascii_optimization && self::is_ascii($data)) return strtolower($data); #speed improve! #deprecated, since PHP-5.3.x strtr() 2-3 times faster then mb_strtolower() #if (function_exists('mb_strtolower')) return mb_strtolower($data, 'utf-8'); @@ -3038,50 +3103,52 @@ class UTF8 /** * Convert a data to lower case * - * @param array|scalar|null $data - * @return scalar|bool|null Returns FALSE if error occurred */ + * @param array|scalar|null $data + * @return scalar|bool|null Returns FALSE if error occurred + */ public static function lowercase($data) { - if (! ReflectionTypeHint::isValid()) return false; + if (!ReflectionTypeHint::isValid()) return false; return self::convert_case($data, CASE_LOWER); } /** * Convert a data to upper case * - * @param array|scalar|null $data + * @param array|scalar|null $data * @return scalar|null Returns FALSE if error occurred */ public static function uppercase($data) { - if (! ReflectionTypeHint::isValid()) return false; + if (!ReflectionTypeHint::isValid()) return false; return self::convert_case($data, CASE_UPPER); } /** * Convert a data to lower case * - * @param array|scalar|null $data + * @param array|scalar|null $data * @return scalar|bool|null Returns FALSE if error occurred */ public static function strtolower($data) { - if (! ReflectionTypeHint::isValid()) return false; + if (!ReflectionTypeHint::isValid()) return false; return self::convert_case($data, CASE_LOWER); } /** * Convert a data to upper case * - * @param array|scalar|null $data + * @param array|scalar|null $data * @return scalar|null Returns FALSE if error occurred */ public static function strtoupper($data) { - if (! ReflectionTypeHint::isValid()) return false; + if (!ReflectionTypeHint::isValid()) return false; return self::convert_case($data, CASE_UPPER); } + /** * Convert all HTML entities to native UTF-8 characters * Функция декодирует гораздо больше именованных сущностей, чем стандартная html_entity_decode() @@ -3089,35 +3156,38 @@ class UTF8 * * Example: '"' or '"' or '"' will be converted to '"'. * - * @link http://www.htmlhelp.com/reference/html40/entities/ - * @link http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References) - * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true - * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true - * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true + * @link http://www.htmlhelp.com/reference/html40/entities/ + * @link http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References) + * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true + * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true + * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true * - * @param scalar|null $s - * @param bool $is_special_chars Дополнительно обрабатывать специальные html сущности? (< > & ") + * @param scalar|null $s + * @param bool $is_special_chars Дополнительно обрабатывать специальные html сущности? (< > & " ') * @return scalar|null Returns FALSE if error occurred */ public static function html_entity_decode($s, $is_special_chars = false) { - if (! ReflectionTypeHint::isValid()) return false; - if (! is_string($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s) || $s === '') return $s; #speed improve if (strlen($s) < 4 #по минимальной длине сущности - 4 байта: &#d; &xx; || ($pos = strpos($s, '&') === false) || strpos($s, ';', $pos) === false) return $s; $table = self::$html_entity_table; - if ($is_special_chars) $table += self::$html_special_chars_table; - + if ($is_special_chars) { + $table += self::$html_special_chars_table + + array( + #' entity is only available in XHTML/HTML5 and not in plain HTML, see http://www.w3.org/TR/xhtml1/#C_16 + ''' => "\x27", #U+0027 ['] ' apostrophe + ); + } #replace named entities $s = strtr($s, $table); #block below deprecated, since PHP-5.3.x strtr() 1.5 times faster - if (0 && preg_match_all('/&[a-zA-Z]++\d*+;/sSX', $s, $m, null, $pos)) - { - foreach (array_unique($m[0]) as $entity) - { + if (0 && preg_match_all('/&[a-zA-Z]++\d*+;/sSX', $s, $m, null, $pos)) { + foreach (array_unique($m[0]) as $entity) { if (array_key_exists($entity, $table)) $s = str_replace($entity, $table[$entity], $s); } } @@ -3128,16 +3198,14 @@ class UTF8 $class = __CLASS__; $html_special_chars_table_flipped = array_flip(self::$html_special_chars_table); $s = preg_replace_callback('/&#((x)[\da-fA-F]{1,6}+|\d{1,7}+);/sSX', - function (array $m) use ($class, $html_special_chars_table_flipped, $is_special_chars) - { - $codepoint = isset($m[2]) && $m[2] === 'x' ? hexdec($m[1]) : $m[1]; - if (! $is_special_chars) - { - $char = pack('C', $codepoint); - if (array_key_exists($char, $html_special_chars_table_flipped)) return $html_special_chars_table_flipped[$char]; - } - return $class::chr($codepoint); - }, $s); + function (array $m) use ($class, $html_special_chars_table_flipped, $is_special_chars) { + $codepoint = isset($m[2]) && $m[2] === 'x' ? hexdec($m[1]) : $m[1]; + if (!$is_special_chars) { + $char = pack('C', $codepoint); + if (array_key_exists($char, $html_special_chars_table_flipped)) return $html_special_chars_table_flipped[$char]; + } + return $class::chr($codepoint); + }, $s); } return $s; } @@ -3152,29 +3220,28 @@ class UTF8 * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true * - * @param scalar|null $s - * @param bool $is_special_chars_only Обрабатывать только специальные html сущности? (< > & ") + * @param scalar|null $s + * @param bool $is_special_chars_only Обрабатывать только специальные html сущности? (< > & ") * @return scalar|null Returns FALSE if error occurred */ public static function html_entity_encode($s, $is_special_chars_only = false) { - if (! ReflectionTypeHint::isValid()) return false; - if (! is_string($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s) || $s === '') return $s; - #if ($is_special_chars_only) return strtr($s, array_flip(self::$html_special_chars_table)); - if ($is_special_chars_only) return htmlspecialchars($s); + if ($is_special_chars_only) return strtr($s, array_flip(self::$html_special_chars_table)); #binary support + #if ($is_special_chars_only) return htmlspecialchars($s); #DEPRECATED, charset dependent #replace UTF-8 chars to named entities: $s = strtr($s, array_flip(self::$html_entity_table)); + #block below deprecated, since PHP-5.3.x strtr() 3 times faster if (0 && preg_match_all('~(?> [\xc2\xc3\xc5\xc6\xcb\xce\xcf][\x80-\xbf] #2 bytes | \xe2[\x80-\x99][\x82-\xac] #3 bytes ) - ~sxSX', $s, $m)) - { + ~sxSX', $s, $m)) { $table = array_flip(self::$html_entity_table); - foreach (array_unique($m[0]) as $char) - { + foreach (array_unique($m[0]) as $char) { if (array_key_exists($char, $table)) $s = str_replace($char, $table[$char], $s); } } @@ -3184,38 +3251,45 @@ class UTF8 /** * Make regular expression for case insensitive match - * Example (non ASCII): "123_слово_test" => "123_(с|С)(л|Л)(о|О)(в|В)(о|О)_[tT][eE][sS][tT]" - * Example (only ASCII): "123_test" => "(?i:123_test)" + * Example (only digits): "123" => "123" + * Example (only ASCII): "123_test" => "(?i:123_test)" + * Example (upper ASCII): "123_слово_test" => "123_(с|С)(л|Л)(о|О)(в|В)(о|О)_[tT][eE][sS][tT]" * - * @param string $s - * @param string|null $delimiter If the optional delimiter is specified, it will also be escaped. + * @param string|null $s + * @param string|null $delimiter If the optional delimiter is specified, it will also be escaped. * This is useful for escaping the delimiter that is required by the PCRE functions. * The / is the most commonly used delimiter. * @return string|bool|null Returns FALSE if error occurred */ public static function preg_quote_case_insensitive($s, $delimiter = null) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s) || $s === '') return $s; + if (ctype_digit($s)) return preg_quote($s, $delimiter); #speed improve if (self::is_ascii($s)) return '(?i:' . preg_quote($s, $delimiter) . ')'; #speed improve + $s_lc = self::convert_case($s, CASE_LOWER, false); + if ($s_lc === false) return false; + $s_uc = self::convert_case($s, CASE_UPPER, false); + if ($s_uc === false) return false; + if ($s_lc === $s_uc) return preg_quote($s, $delimiter); #speed improve + + $chars_lc = self::str_split($s_lc); + if ($chars_lc === false) return false; + $chars_uc = self::str_split($s_uc); + if ($chars_uc === false) return false; + $s_re = ''; - $s_lc = UTF8::lowercase($s); if ($s_lc === false) return false; - $s_uc = UTF8::uppercase($s); if ($s_uc === false) return false; - - $chars_lc = UTF8::str_split($s_lc); if ($chars_lc === false) return false; - $chars_uc = UTF8::str_split($s_uc); if ($chars_uc === false) return false; - - foreach ($chars_lc as $i => $char) - { + foreach ($chars_lc as $i => $char) { if ($chars_lc[$i] === $chars_uc[$i]) $s_re .= preg_quote($chars_lc[$i], $delimiter); - elseif (self::is_ascii($chars_lc[$i])) - $s_re .= '[' . preg_quote($chars_lc[$i] . $chars_uc[$i], $delimiter) . ']'; + elseif (strlen($chars_lc[$i]) === 1 /*self::is_ascii($chars_lc[$i])*/) + $s_re .= '[' . self::_preg_quote_class($chars_lc[$i] . $chars_uc[$i], $delimiter) . ']'; else + #для русских и др. букв, т. к. флаг /u и (?i:слово) не помогают :( $s_re .= '(' . preg_quote($chars_lc[$i], $delimiter) . '|' - . preg_quote($chars_uc[$i], $delimiter) . ')'; + . preg_quote($chars_uc[$i], $delimiter) . ')'; } return $s_re; } @@ -3226,27 +3300,25 @@ class UTF8 * * @link http://bolknote.ru/2010/09/08/~2704 * - * @param string $pattern - * @param string|null $subject - * @param array $matches - * @param int $flags - * @param int $char_offset + * @param string $pattern + * @param string|null $subject + * @param array $matches + * @param int $flags + * @param int $char_offset * @return array|bool|null Returns FALSE if error occurred */ public static function preg_match_all($pattern, $subject, &$matches, $flags = PREG_PATTERN_ORDER, $char_offset = 0) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($subject)) return null; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($subject)) return $subject; $byte_offset = ($char_offset > 0) ? strlen(self::substr($subject, 0, $char_offset)) : $char_offset; $return = preg_match_all($pattern, $subject, $matches, $flags, $byte_offset); if ($return === false) return false; - if ($flags & PREG_OFFSET_CAPTURE) - { - foreach ($matches as &$match) - { + if ($flags & PREG_OFFSET_CAPTURE) { + foreach ($matches as &$match) { foreach ($match as &$a) $a[1] = self::strlen(substr($subject, 0, $a[1])); } } @@ -3265,45 +3337,36 @@ class UTF8 * причём последнее слово показывается целиком, а не обрывается на середине. * Html сущности корректно обрабатываются. * - * @param string|null $s Текст в кодировке UTF-8 - * @param int|null|digit $maxlength Ограничение длины текста - * @param string $continue Завершающая строка, которая будет вставлена после текста, если он обрежется - * @param bool|null &$is_cutted Текст был обрезан? - * @param int|digit $tail_min_length Если длина "хвоста", оставшегося после обрезки текста, меньше $tail_min_length, + * @param string|null $s Текст в кодировке UTF-8 + * @param int|null|digit $maxlength Ограничение длины текста + * @param string $continue Завершающая строка, которая будет вставлена после текста, если он обрежется + * @param bool|null &$is_cutted Текст был обрезан? + * @param int|digit $tail_min_length Если длина "хвоста", оставшегося после обрезки текста, меньше $tail_min_length, * то текст возвращается без изменений * @return string|bool|null Returns FALSE if error occurred */ public static function str_limit($s, $maxlength = null, $continue = "\xe2\x80\xa6", &$is_cutted = null, $tail_min_length = 20) #"\xe2\x80\xa6" = "…" { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s) || $s === '') return $s; $is_cutted = false; if ($continue === null) $continue = "\xe2\x80\xa6"; - if (! $maxlength) $maxlength = 256; + if (!$maxlength) $maxlength = 256; #speed improve block #{{{ if (strlen($s) <= $maxlength) return $s; $s2 = str_replace("\r\n", '?', $s); - $s2 = preg_replace('/&(?> [a-zA-Z][a-zA-Z\d]+ - | \#(?> \d{1,4} - | x[\da-fA-F]{2,4} - ) - ); # html сущности (< > & ") - /sxSX', '?', $s2); + $s2 = preg_replace('~' . self::HTML_ENTITY_RE . '~sxSX', '?', $s2); if (strlen($s2) <= $maxlength || self::strlen($s2) <= $maxlength) return $s; #}}} - $r = preg_match_all('/(?> \r\n # переносы строк - | &(?> [a-zA-Z][a-zA-Z\d]+ - | \#(?> \d{1,4} - | x[\da-fA-F]{2,4} - ) - ); # html сущности (< > & ") + $r = preg_match_all('~(?> \r\n # next line + | ' . self::HTML_ENTITY_RE . ' | . ) - /sxuSX', $s, $m); + ~sxuSX', $s, $m); if ($r === false) return false; #d($m); @@ -3314,18 +3377,18 @@ class UTF8 #нельзя вырезать в конце строки символ ";", т.к. он используются в сущностях &xxx; $left2 = rtrim($left, "\x00..\x28\x2A..\x2F\x3A\x3C..\x3E\x40\x5B\x5C\x5E..\x60\x7B\x7C\x7E\x7F"); if (strlen($left) !== strlen($left2)) $return = $left2 . $continue; - else - { + else { #добавляем остаток к обрезанному слову $right = implode('', array_slice($m[0], $maxlength)); - preg_match('/^(?> [\d\)\]\}\-\.:]+ #цифры, закрывающие парные символы, дефис для составных слов, дата, время, IP-адреса, URL типа www.ya.ru:80! - | \p{L}+ #буквы - | \xe2\x80\x9d #закрывающие кавычки - | \xe2\x80\x99 #закрывающие кавычки - | \xe2\x80\x9c #закрывающие кавычки - | \xc2\xbb #закрывающие кавычки - )+ - /suxSX', $right, $m); + preg_match('/^(?> + #цифры, закрывающие парные символы, дефис для составных слов, дата, время, IP-адреса, URL типа www.ya.ru:80! + [\d\)\]\}\-\.:]+ + #letters + | \p{L}+ + #quotation marks + | [' . implode('', self::$html_quotation_mark_table) . ']+ + )+ + /suxSX', $right, $m); #d($m); $right = isset($m[0]) ? rtrim($m[0], '.-') : ''; $return = $left . $right; @@ -3340,14 +3403,14 @@ class UTF8 /** * Implementation str_split() function for UTF-8 encoding string. * - * @param string|null $s - * @param int|null|digit $length + * @param string|null $s + * @param int|null|digit $length * @return array|bool|null Returns FALSE if error occurred */ public static function str_split($s, $length = null) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s)) return $s; $length = ($length === null) ? 1 : intval($length); if ($length < 1) return false; @@ -3355,8 +3418,7 @@ class UTF8 if (preg_match_all('~.~suSX', $s, $m) === false) return false; if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) return false; if ($length === 1) $a = $m[0]; - else - { + else { $a = array(); for ($i = 0, $c = count($m[0]); $i < $c; $i += $length) $a[] = implode('', array_slice($m[0], $i, $length)); } @@ -3366,62 +3428,63 @@ class UTF8 /** * Implementation strlen() function for UTF-8 encoding string. * - * @param string|null $s + * @param string|null $s * @return int|bool|null Returns FALSE if error occurred */ public static function strlen($s) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s)) return $s; //since PHP-5.3.x mb_strlen() faster then strlen(utf8_decode()) if (function_exists('mb_strlen')) return mb_strlen($s, 'utf-8'); /* - utf8_decode() converts characters that are not in ISO-8859-1 to '?', which, for the purpose of counting, is quite alright. - It's much faster than iconv_strlen() - Note: this function does not count bad UTF-8 bytes in the string - these are simply ignored + utf8_decode() converts characters that are not in ISO-8859-1 to '?', which, for the purpose of counting, is quite alright. + It's much faster than iconv_strlen() + Note: this function does not count bad UTF-8 bytes in the string - these are simply ignored */ return strlen(utf8_decode($s)); /* - #slowly then strlen(utf8_decode()) - if (function_exists('iconv_strlen')) return iconv_strlen($s, 'utf-8'); + #iconv_strlen() slowly then strlen(utf8_decode()) + if (function_exists('iconv_strlen')) return iconv_strlen($s, 'utf-8'); - #Do not count UTF-8 continuation bytes - #return strlen(preg_replace('/[\x80-\xBF]/sSX', '', $s)); + #Do not count UTF-8 continuation bytes + #return strlen(preg_replace('/[\x80-\xBF]/sSX', '', $s)); - #slowly then strlen(utf8_decode()) - preg_match_all('~.~suSX', $str, $m); - return count($m[0]); + #slowly then strlen(utf8_decode()) + preg_match_all('~.~suSX', $str, $m); + return count($m[0]); - #slowly then preg_match_all() + count() - $n = 0; - for ($i = 0, $len = strlen($s); $i < $len; $i++) - { - $c = ord(substr($s, $i, 1)); - if ($c < 0x80) $n++; #single-byte (0xxxxxx) - elseif (($c & 0xC0) == 0xC0) $n++; #multi-byte starting byte (11xxxxxx) - } - return $n; + #slowly then preg_match_all() + count() + $n = 0; + for ($i = 0, $len = strlen($s); $i < $len; $i++) + { + $c = ord(substr($s, $i, 1)); + if ($c < 0x80) $n++; #single-byte (0xxxxxx) + elseif (($c & 0xC0) == 0xC0) $n++; #multi-byte starting byte (11xxxxxx) + } + return $n; */ } /** * Implementation strpos() function for UTF-8 encoding string * - * @param string|null $s The entire string - * @param string|int $needle The searched substring - * @param int|null $offset The optional offset parameter specifies the position from which the search should be performed + * @param string|null $s The entire string + * @param string|int $needle The searched substring + * @param int|null $offset The optional offset parameter specifies the position from which the search should be performed * @return int|bool|null Returns the numeric position of the first occurrence of needle in haystack. * If needle is not found, will return FALSE. */ public static function strpos($s, $needle, $offset = null) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s)) return $s; if ($offset === null || $offset < 0) $offset = 0; + #mb_strpos() faster then iconv_strpos() if (function_exists('mb_strpos')) return mb_strpos($s, $needle, $offset, 'utf-8'); #iconv_strpos() deprecated, because slowly than self::strlen(substr()) #if (function_exists('iconv_strpos')) return iconv_strpos($s, $needle, $offset, 'utf-8'); @@ -3434,16 +3497,16 @@ class UTF8 /** * Find position of first occurrence of a case-insensitive string. * - * @param string|null $s The entire string - * @param string|int $needle The searched substring - * @param int|null $offset The optional offset parameter specifies the position from which the search should be performed + * @param string|null $s The entire string + * @param string|int $needle The searched substring + * @param int|null $offset The optional offset parameter specifies the position from which the search should be performed * @return int|bool|null Returns the numeric position of the first occurrence of needle in haystack. * If needle is not found, will return FALSE. */ public static function stripos($s, $needle, $offset = null) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s)) return $s; if ($offset === null || $offset < 0) $offset = 0; if (function_exists('mb_stripos')) return mb_stripos($s, $needle, $offset, 'utf-8'); @@ -3465,23 +3528,23 @@ class UTF8 /** * Implementation strrev() function for UTF-8 encoding string * - * @param string|null $s + * @param string|null $s * @return string|bool|null Returns FALSE if error occurred */ public static function strrev($s) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s) || $s === '') return $s; if (0) #TODO test speed { $s = self::_convert($s, 'UTF-8', 'UTF-32'); - if (! is_string($s)) return false; + if (!is_string($s)) return false; $s = implode('', array_reverse(str_split($s, 4))); return self::_convert($s, 'UTF-32', 'UTF-8'); } - if (! is_array($a = self::str_split($s))) return false; + if (!is_array($a = self::str_split($s))) return false; return implode('', array_reverse($a)); } @@ -3489,24 +3552,22 @@ class UTF8 * Implementation substr() function for UTF-8 encoding string. * * @link http://www.w3.org/International/questions/qa-forms-utf-8.html - * @param string|null $s - * @param int|digit $offset - * @param int|null|digit $length + * @param string|null $s + * @param int|digit $offset + * @param int|null|digit $length * @return string|bool|null Returns FALSE if error occurred */ public static function substr($s, $offset, $length = null) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s)) return $s; #since PHP-5.3.x mb_substr() faster then iconv_substr() - if (function_exists('mb_substr')) - { + if (function_exists('mb_substr')) { if ($length === null) $length = self::strlen($s); return mb_substr($s, $offset, $length, 'utf-8'); } - if (function_exists('iconv_substr')) - { + if (function_exists('iconv_substr')) { if ($length === null) $length = self::strlen($s); return iconv_substr($s, $offset, $length, 'utf-8'); } @@ -3515,7 +3576,7 @@ class UTF8 static $_a = null; if ($_s !== $s) $_a = self::str_split($_s = $s); - if (! is_array($_a)) return false; + if (!is_array($_a)) return false; if ($length !== null) $a = array_slice($_a, $offset, $length); else $a = array_slice($_a, $offset); return implode('', $a); @@ -3524,18 +3585,19 @@ class UTF8 /** * Implementation substr_replace() function for UTF-8 encoding string. * - * @param string|null $s - * @param string|int $replacement - * @param int|digit $start - * @param int|null $length + * @param string|null $s + * @param string|int $replacement + * @param int|digit $start + * @param int|null $length * @return string|bool|null Returns FALSE if error occurred */ public static function substr_replace($s, $replacement, $start, $length = null) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s) || $s === '') return $s; - if (! is_array($a = self::str_split($s))) return false; + $a = self::str_split($s); + if (!is_array($a)) return false; array_splice($a, $start, $length, $replacement); return implode('', $a); } @@ -3543,19 +3605,22 @@ class UTF8 /** * Implementation ucfirst() function for UTF-8 encoding string. * Преобразует первый символ строки в кодировке UTF-8 в верхний регистр. + * Корректно обрабатывает слова в кавычках, например: «северный поток» --> «Северный поток» * - * @param string|null $s - * @param bool $is_other_to_lowercase остальные символы преобразуются в нижний регистр? + * @param string|null $s + * @param bool $is_other_to_lowercase остальные символы преобразуются в нижний регистр? * @return string|bool|null Returns FALSE if error occurred */ public static function ucfirst($s, $is_other_to_lowercase = true) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if ($s === '' || !is_string($s)) return $s; - if ($s === '' || ! is_string($s)) return $s; - if (! preg_match('/^(.)(.*)$/suSX', $s, $m)) return false; - return self::uppercase($m[1]) . ($is_other_to_lowercase ? self::lowercase($m[2]) : $m[2]); + if (!preg_match('/^([' . implode('', self::$html_quotation_mark_table) . ']{1,2}+) #1 quotation marks + (\p{L}) #2 first letter + (.*+) #3 next letters + $/sxuSX', $s, $m)) return $s; #letters not found + return $m[1] . self::uppercase($m[2]) . ($is_other_to_lowercase ? self::lowercase($m[3]) : $m[3]); } /** @@ -3563,93 +3628,126 @@ class UTF8 * Преобразует в верхний регистр первый символ каждого слова в строке в кодировке UTF-8, * остальные символы каждого слова преобразуются в нижний регистр. * - * @param string|null $s - * @param bool $is_other_to_lowercase остальные символы преобразуются в нижний регистр? - * @param string $spaces_re + * @param string|null $s + * @param bool $is_other_to_lowercase остальные символы преобразуются в нижний регистр? + * @param string $spaces_re * @return string|bool|null Returns FALSE if error occurred */ - public static function ucwords($s, $is_other_to_lowercase = true, $spaces_re = '~([\pZ\s]+)~suSX') #\pXps is POSIX space: property Z or tab, NL, VT, FF, CR + public static function ucwords($s, $is_other_to_lowercase = true, $spaces_re = '~([\p{Z}\s]+)~suSX') { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if ($s === '' || !is_string($s)) return $s; $words = preg_split($spaces_re, $s, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE); - foreach ($words as $k => $word) - { - $words[$k] = self::ucfirst($word, $is_other_to_lowercase = true); + foreach ($words as $k => $word) { + $words[$k] = self::ucfirst($word, $is_other_to_lowercase); if ($words[$k] === false) return false; } return implode('', $words); } /** - * Decodes a string in the format %uXXXX or %u{XXXXXX} in the UTF-8 string. + * Decodes a string to UTF-8 string from some formats (can be mixed) + * Examples + * '%D1%82%D0%B5%D1%81%D1%82' => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82" #binary (regular) + * '0xD182D0B5D181D182' => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82" #binary (compact) + * '%u0442%u0435%u0441%u0442' => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82" #UCS-2 (U+0 — U+FFFF) + * '%u{442}%u{435}%u{0441}%u{00442}' => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82" #UTF-8 (U+0 — U+FFFFFF) * - * Используется для декодирования данных типа "%u0442%u0435%u0441%u0442", - * закодированных устаревшей функцией javascript://encode(). - * Рекомендуется использовать функцию javascript://encodeURIComponent(). + * It is used to decode the data in the format %uXXXX, encoded deprecated + * javascript's function encode(). Recommended to use encodeURIComponent(). + * Obsolete format %uXXXX allows unicode only in the range of UCS-2, ie, U+0 to U+FFFF. * - * NOTICE - * Устаревший формат %uXXXX позволяет использовать юникод только из диапазона UCS-2, т.е. от U+0 до U+FFFF - * - * @param scalar|array|null $data - * @param bool $is_rawurlencode - * @return scalar|array|null Returns FALSE if error occurred + * @param array|scalar|null $data + * @param bool $is_hex2bin Decode the HEX-data? + * Example: '0xD182D0B5D181D182' => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82" + * Hint: parameters in the URL address is sometimes + * convenient to encode not function rawurlencode($string), + * and use the following mechanism (encoded data is more compact): + * '0x' . bin2hex($string) + * @param bool $is_urldecode + * @return array|scalar|null Returns FALSE if error occurred + * @see urldecode() */ - public static function unescape($data, $is_rawurlencode = false) + public static function unescape($data, $is_hex2bin = false, $is_urldecode = true) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_array($data)) - { + if (!ReflectionTypeHint::isValid()) return false; + if (is_array($data)) { $d = array(); - foreach ($data as $k => &$v) - { - $k = self::unescape($k, $is_rawurlencode); - if ($k === false) return false; - $d[$k] = self::unescape($v, $is_rawurlencode); - if ($d[$k] === false && ! is_bool($v)) return false; + foreach ($data as $k => &$v) { + if (is_string($k)) { + $k = self::unescape($k, $is_hex2bin, $is_urldecode); + if (!is_string($k)) return false; + } + $d[$k] = self::unescape($v, $is_hex2bin, $is_urldecode); + if ($d[$k] === false && !is_bool($v)) return false; } return $d; } - if (is_string($data)) - { - if (strpos($data, '%u') === false) return $data; #use strpos() for speed improving - return preg_replace_callback('/%u( [\da-fA-F]{4}+ #%uXXXX only UCS-2 - | \{ [\da-fA-F]{1,6}+ \} #%u{XXXXXX} extended form for all UNICODE charts - ) - /sxSX', - function (array $m) use ($is_rawurlencode) - { - $codepoint = hexdec(trim($m[1], '{}')); - $char = self::chr($codepoint); - return $is_rawurlencode ? rawurlencode($char) : $char; - }, - $data); + if (is_string($data)) { + #use strpos() for speed improving of regexp + if ($is_hex2bin && strpos($data, '0x') !== false) { + $data = preg_replace_callback( + '~0x((?:[\da-fA-F]{2})+)~sSX', + function (array $m) { + $s = pack('H' . strlen($m[1]), $m[1]); #hex2bin() + return rawurlencode($s); + }, + $data); + } + if (strpos($data, '%u') !== false) { + $class = __CLASS__; + $data = preg_replace_callback( + '~%u( [\da-fA-F]{4}+ #%uXXXX only UCS-2 + | \{ [\da-fA-F]{1,6}+ \} #%u{XXXXXX} extended form for all UNICODE charts + ) + ~sxSX', + function (array $m) use ($class) { + $codepoint = hexdec(trim($m[1], '{}')); + $char = $class::chr($codepoint); + return rawurlencode($char); + }, + $data); + } + return $is_urldecode ? urldecode($data) : $data; } if (is_scalar($data) || is_null($data)) return $data; #~ null, integer, float, boolean return false; #object or resource } /** - * 1) Corrects the global arrays $_GET, $_POST, $_COOKIE, $_REQUEST - * decoded values ​​in the format %uXXXX and %u{XXXXXX}, encoded, + * 1) Corrects the global arrays $_GET, $_POST, $_COOKIE, $_REQUEST, $_FILES + * decoded values ​​from %XX and extended %uXXXX / %u{XXXXXX} format, * for example, through an outdated javascript function escape(). * Standard PHP5 cannot do it. - * 2) If in the HTTP_COOKIE there are parameters with the same name, - * takes the last value, not the first, as in the QUERY_STRING. - * 3) Creates an array of $_POST for non-standard Content-Type, for example, "Content-Type: application/octet-stream". - * Standard PHP5 creates an array for "Content-Type: application/x-www-form-urlencoded" and "Content-Type: multipart/form-data". + * 2) Recode $_GET, $_POST, $_COOKIE, $_REQUEST, $_FILES from $charset + * encoding to UTF-8, if necessary. + * A side effect is a positive protection against XSS attacks with + * non-printable characters on the vulnerable PHP function. + * Thus web forms can be sent to the server in 2-encoding: $charset and UTF-8. + * For example: ?тест[тест]=тест + * 3) If in the HTTP_COOKIE there are parameters with the same name, + * takes the last value (as in the QUERY_STRING), not the first. + * 4) Creates an array of $_POST for non-standard Content-Type, for example, + * "Content-Type: application/octet-stream". Standard PHP5 creates + * an array for "Content-Type: application/x-www-form-urlencoded" + * and "Content-Type: multipart/form-data". + * + * Examples + * '%F2%E5%F1%F2' => 'тест' #CP1251 (regular) + * '0xF2E5F1F2' => 'тест' #CP1251 (compact) + * '%D1%82%D0%B5%D1%81%D1%82' => 'тест' #UTF-8 (regular) + * '0xD182D0B5D181D182' => 'тест' #UTF-8 (compact) + * '%u0442%u0435%u0441%u0442' => 'тест' #UCS-2 (U+0 — U+FFFF) + * '%u{442}%u{435}%u{0441}%u{00442}' => 'тест' #UTF-8 (U+0 — U+FFFFFF) * * Сессии, куки и независимая авторизация на поддоменах. * * ПРИМЕР 1 * У рабочего сайта http://domain.com появились поддомены. * Для кроссдоменной авторизации через механизм сессий имя хоста для COOKIE было изменено с "domain.com" на ".domain.com" - * В результате авторизация не работает. - * Помогает очистка COOKIE, но их принудительная очистка на тысячах пользовательских компьютеров проблематична. - * Проблема в следующем: если в HTTP_COOKIE есть параметры с одинаковым именем, то берётся последнее значение, - * а не первое, как в QUERY_STRING. - * Более подробное описание: + * В результате авторизация не работает. Решение: поменять имя сессии. + * Ещё помогает очистка COOKIE, но их принудительная очистка на тысячах пользовательских компьютеров проблематична. * PHP не правильно (?) обрабатывает заголовок HTTP_COOKIE, если там встречаются параметры с одинаковым именем, но разными значениями. * Пример запроса HTTP-заголовка клиентом: "Cookie: sid=chpgs2fiak-330mzqza; sid=cmz5tnp5zz-xlbbgqp" * В этом случае сервер берёт первое значение, а не последнее. @@ -3657,7 +3755,6 @@ class UTF8 * В HTTP_COOKIE два параметра с одинаковым именем могут появиться, если отправить клиенту следующие HTTP-заголовки: * "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=domain.com" (только domain.com) * "Set-Cookie: sid=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (domain.com и все его поддомены) - * Решение: поменять имя сессии. * * ПРИМЕР 2 * Есть рабочие сайты: http://domain.com (основной), http://admin.domain.com (админка), @@ -3666,48 +3763,58 @@ class UTF8 * Требуется сделать независимую кросс-доменную авторизацию для http://*.domain.com и http://*.dev.domain.com. * Для сохранения статуса авторизации будем использовать сессию, имя и значение которой пишется в COOKIE. * Т. к. домены http://*.dev.domain.com имеют пересечение с доменами http://*.domain.com, - * для независимой авторизации нужно использовать разные имена сессий. + * для независимой авторизации нужно использовать разные имена сессий! * Пример HTTP заголовков ответа сервера: * "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (.domain.com и все его поддомены) * "Set-Cookie: sid.dev=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.dev.domain.com" (dev.domain.com и все его поддомены) * * @link http://tools.ietf.org/html/rfc2965 RFC 2965 - HTTP State Management Mechanism - * @return void + * @param bool $is_hex2bin Decode the HEX-data? + * Example: '0xD182D0B5D181D182' => "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82" + * Hint: parameters in the URL address is sometimes + * convenient to encode not function rawurlencode($string), + * and use the following mechanism (encoded data is more compact): + * '0x' . bin2hex($string) + * @param string $charset + * @return bool */ - public static function unescape_request() + public static function unescape_request($is_hex2bin = false, $charset = 'ISO-8859-1') { $fixed = false; - #ATTENTION! HTTP_RAW_POST_DATA is only accessible when Content-Type of POST request is NOT default "application/x-www-form-urlencoded"! + #ATTENTION! HTTP_RAW_POST_DATA is only accessible when Content-Type of POST request is NOT default "application/x-www-form-urlencoded"! $HTTP_RAW_POST_DATA = isset($_SERVER['REQUEST_METHOD']) && $_SERVER['REQUEST_METHOD'] === 'POST' ? (isset($GLOBALS['HTTP_RAW_POST_DATA']) ? $GLOBALS['HTTP_RAW_POST_DATA'] : @file_get_contents('php://input')) : null; if (ini_get('always_populate_raw_post_data')) $GLOBALS['HTTP_RAW_POST_DATA'] = $HTTP_RAW_POST_DATA; - foreach (array( '_GET' => isset($_SERVER['QUERY_STRING']) ? $_SERVER['QUERY_STRING'] : null, - '_POST' => $HTTP_RAW_POST_DATA, - '_COOKIE' => isset($_SERVER['HTTP_COOKIE']) ? $_SERVER['HTTP_COOKIE'] : null, - ) as $k => $v) - { - if (! is_string($v)) continue; - if ($k === '_COOKIE') - { + foreach (array('_GET' => isset($_SERVER['QUERY_STRING']) ? $_SERVER['QUERY_STRING'] : null, + '_POST' => $HTTP_RAW_POST_DATA, + '_COOKIE' => isset($_SERVER['HTTP_COOKIE']) ? $_SERVER['HTTP_COOKIE'] : null, + '_FILES' => isset($_FILES) ? $_FILES : null, + ) as $k => $v) { + if (!is_string($v)) continue; + + if ($k === '_COOKIE') { $v = preg_replace('/; *+/sSX', '&', $v); unset($_COOKIE); #будем парсить HTTP_COOKIE сами, чтобы сделать обработку как у QUERY_STRING } - if (strpos($v, '%u') !== false) - { - parse_str(self::unescape($v, $is_rawurlencode = true), $GLOBALS[$k]); - $fixed = true; - continue; - } - if (array_key_exists($k, $GLOBALS)) continue; + + $v = self::unescape($v, $is_hex2bin, false); + if ($v === false) return false; parse_str($v, $GLOBALS[$k]); + + $GLOBALS[$k] = self::convert_from($GLOBALS[$k], $charset); + if ($GLOBALS[$k] === false) { + trigger_error('Array $' . $k . ' does not have keys/values in UTF-8 charset!', E_USER_WARNING); + return false; + } + $fixed = true; } - if ($fixed) - { + if ($fixed) { $_REQUEST = (isset($_COOKIE) ? $_COOKIE : array()) + (isset($_POST) ? $_POST : array()) + (isset($_GET) ? $_GET : array()); } + return true; } /** @@ -3718,22 +3825,21 @@ class UTF8 * на следующую строку, высота м.б. меньше ожидаемой. * Этот алгоритм явл. простым (и быстрым) и не отслеживает переносы слов. * - * @param string|null $s Текст - * @param int|digit $cols Ширина области редактирования (колонок) - * @param int|digit $min_rows Минимальное кол-во строк - * @param int|digit $max_rows Максимальное кол-во строк + * @param string|null $s Текст + * @param int|digit $cols Ширина области редактирования (колонок) + * @param int|digit $min_rows Минимальное кол-во строк + * @param int|digit $max_rows Максимальное кол-во строк * @return int|bool|null Number of rows (lines) */ public static function textarea_rows($s, $cols, $min_rows = 3, $max_rows = 32) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s)) return $s; if (strlen($s) == 0) return $min_rows; #speed improve $rows = 0; #utf8_decode() converts characters that are not in ISO-8859-1 to '?' - foreach (preg_split('/\r\n|[\r\n]/sSX', utf8_decode($s)) as $line) - { + foreach (preg_split('/\r\n|[\r\n]/sSX', utf8_decode($s)) as $line) { $rows += ceil((strlen($line) + 1) / $cols); if ($rows > $max_rows) return $max_rows; } @@ -3741,69 +3847,74 @@ class UTF8 } /** - * @param string|null $s - * @param string|null $charlist + * @param string|null $s + * @param string|null $charlist * @return string|bool|null */ public static function ltrim($s, $charlist = null) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s) || $s === '') return $s; if ($charlist === null || self::is_ascii($charlist)) return ltrim($s); return preg_replace('~^[' . self::_preg_quote_class($charlist, '~') . ']+~suSX', '', $s); } /** - * @param string|null $s - * @param string|null $charlist + * @param string|null $s + * @param string|null $charlist * @return string|bool|null */ public static function rtrim($s, $charlist = null) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s) || $s === '') return $s; if ($charlist === null || self::is_ascii($charlist)) return rtrim($s); return preg_replace('~[' . self::_preg_quote_class($charlist, '~') . ']+$~suSX', '', $s); } /** - * @param scalar|null $s - * @param string|null $charlist + * @param scalar|null $s + * @param string|null $charlist * @return scalar|null */ public static function trim($s, $charlist = null) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s) || $s === '') return $s; if ($charlist === null || self::is_ascii($charlist)) return trim($s); $charlist_re = self::_preg_quote_class($charlist, '~'); $s = preg_replace('~^[' . $charlist_re . ']+~suSX', '', $s); return preg_replace('~[' . $charlist_re . ']+$~suSX', '', $s); } + /** + * @param string $charlist + * @param string|null $delimiter + * @return string + */ private static function _preg_quote_class($charlist, $delimiter = null) { #return preg_quote($charlist, $delimiter); #DEPRECATED $quote_table = array( '\\' => '\\\\', - '-' => '\-', - ']' => '\]', + '-' => '\-', + ']' => '\]', ); if (is_string($delimiter)) $quote_table[$delimiter] = '\\' . $delimiter; return strtr($charlist, $quote_table); } /** - * @param string|null $s - * @param int|digit $length - * @param string $pad_str - * @param int $type STR_PAD_LEFT, STR_PAD_RIGHT or STR_PAD_BOTH + * @param string|null $s + * @param int|digit $length + * @param string $pad_str + * @param int $type STR_PAD_LEFT, STR_PAD_RIGHT or STR_PAD_BOTH * @return string|bool|null */ public static function str_pad($s, $length, $pad_str = ' ', $type = STR_PAD_RIGHT) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s)) return $s; $input_len = self::strlen($s); if ($length <= $input_len) return $s; @@ -3811,27 +3922,24 @@ class UTF8 $pad_str_len = self::strlen($pad_str); $pad_len = $length - $input_len; - if ($type == STR_PAD_RIGHT) - { + if ($type == STR_PAD_RIGHT) { $repeat_num = ceil($pad_len / $pad_str_len); return self::substr($s . str_repeat($pad_str, $repeat_num), 0, $length); } - if ($type == STR_PAD_LEFT) - { + if ($type == STR_PAD_LEFT) { $repeat_num = ceil($pad_len / $pad_str_len); return self::substr(str_repeat($pad_str, $repeat_num), 0, intval(floor($pad_len))) . $s; } - if ($type == STR_PAD_BOTH) - { + if ($type == STR_PAD_BOTH) { $pad_len /= 2; - $pad_amount_left = intval(floor($pad_len)); + $pad_amount_left = intval(floor($pad_len)); $pad_amount_right = intval(ceil($pad_len)); - $repeat_times_left = ceil($pad_amount_left / $pad_str_len); + $repeat_times_left = ceil($pad_amount_left / $pad_str_len); $repeat_times_right = ceil($pad_amount_right / $pad_str_len); - $padding_left = self::substr(str_repeat($pad_str, $repeat_times_left), 0, $pad_amount_left); + $padding_left = self::substr(str_repeat($pad_str, $repeat_times_left), 0, $pad_amount_left); $padding_right = self::substr(str_repeat($pad_str, $repeat_times_right), 0, $pad_amount_right); return $padding_left . $s . $padding_right; } @@ -3841,15 +3949,15 @@ class UTF8 } /** - * @param string $str - * @param string $mask - * @param int|null $start - * @param int|null $length + * @param string $str + * @param string $mask + * @param int|null $start + * @param int|null $length * @return int|bool */ public static function strspn($str, $mask, $start = null, $length = null) { - if (! ReflectionTypeHint::isValid()) return false; + if (!ReflectionTypeHint::isValid()) return false; #if (self::is_ascii($str) && self::is_ascii($mask)) return strspn($str, $mask, $start, $length); if ($start !== null || $length !== null) $str = self::substr($str, $start, $length); if (preg_match('~^[' . preg_quote($mask, '~') . ']+~uSX', $str, $m)) self::strlen($m[0]); @@ -3862,15 +3970,15 @@ class UTF8 * So method works reliably enough. * * - * @param string $dir Директория для сканирования - * @param string|null $files_re Регул. выражение для шаблона имён файлов, + * @param string $dir Директория для сканирования + * @param string|null $files_re Регул. выражение для шаблона имён файлов, * например: '~\.(?:txt|sql|php|pl|py|sh|tpl|xml|xsl|html|xhtml|phtml|htm|js|json|css|conf|cfg|ini|htaccess)$~sSX' - * @param bool $is_recursive Обрабатывать вложенные папки и файлы? - * @param string $charset Исходная кодировка - * @param string|null $dirs_ignore_re Регул. выражение для исключения папок из обработки + * @param bool $is_recursive Обрабатывать вложенные папки и файлы? + * @param string $charset Исходная кодировка + * @param string|null $dirs_ignore_re Регул. выражение для исключения папок из обработки * например: '~^(?:cache|images?|photos?|fonts?|img|ico|\.svn|\.hg|\.cvs)$~siSX' - * @param bool $is_echo Печать имён обработанных файлов и статус обработки в выходной поток? - * @param bool $is_simulate Сымитировать работу без реальной перезаписи файлов? + * @param bool $is_echo Печать имён обработанных файлов и статус обработки в выходной поток? + * @param bool $is_simulate Сымитировать работу без реальной перезаписи файлов? * @return int|bool Возвращает кол-во перекодированных файлов * Returns FALSE if error occurred */ @@ -3878,75 +3986,74 @@ class UTF8 $dir, $files_re = null, $is_recursive = true, - $charset = 'cp1251', + $charset = 'CP1251', $dirs_ignore_re = null, $is_echo = false, $is_simulate = false) { - if (! ReflectionTypeHint::isValid()) return false; + if (!ReflectionTypeHint::isValid()) return false; $dh = opendir($dir); - if (! is_resource($dh)) return false; + if (!is_resource($dh)) return false; $counter = 0; - while (($name = readdir($dh)) !== false) - { + while (($name = readdir($dh)) !== false) { if ($name == '.' || $name == '..') continue; $file = $dir . '/' . $name; - if (is_file($file)) - { - if (is_string($files_re) && ! preg_match($files_re, $name)) continue; + if (is_file($file)) { + if (is_string($files_re) && !preg_match($files_re, $name)) continue; if ($is_echo) echo $file; + $s = @file_get_contents($file); - if (! is_string($s)) - { + if (!is_string($s)) { if ($is_echo) echo ' Error to reading' . PHP_EOL; return false; } - if (self::is_utf8($s)) - { - if ($is_echo) echo ' UTF-8' . PHP_EOL; + + if (self::is_utf8($s)) { + if ($is_echo) echo ' Already UTF-8, skipped' . PHP_EOL; continue; } - $s = self::_convert($s, $charset, 'UTF-8'); - #игнорируем ошибки при попытке перекодировать бинарные файлы - if (! is_string($s) || ! self::is_utf8($s)) - { - if ($is_echo) echo ' Binary' . PHP_EOL; + + if (self::has_binary($s)) { + if ($is_echo) echo ' Вinary file, skipped' . PHP_EOL; + continue; + } + + $s = self::convert_from($s, $charset); + if (!is_string($s) || !self::is_utf8($s)) { + if ($is_echo) echo ' Error to converting (source file not in ' . $charset . '?)' . PHP_EOL; continue; } $ext = strtolower(pathinfo($name, PATHINFO_EXTENSION)); - if ($ext === 'htm' || $ext === 'html' || $ext === 'xhtml' || $ext === 'phtml' || $ext === 'tpl') - { - $s = preg_replace('~(]* >) #2 + if ($ext === 'htm' || $ext === 'html' || $ext === 'xhtml' || $ext === 'phtml' || $ext === 'tpl') { + $s = preg_replace('~(]* >) #2 ~sixSX', '$1utf-8$2', $s); } - if ($ext === 'xml' || $ext === 'xsl' || $ext === 'tpl') - { - $s = preg_replace('~(<\?xml .+? encoding=") #1 - [-a-zA-Z\d]+ - (" .*? \?>) #2 + if ($ext === 'xml' || $ext === 'xsl' || $ext === 'tpl') { + $s = preg_replace('~(<\?xml [\x00-\x20]++ encoding=") #1 + [-a-z\d]++ #charset name + (" .*? \?>) #2 ~sixSX', '$1utf-8$2', $s); } - if (! $is_simulate) - { + if (!$is_simulate) { $bytes = @file_put_contents($file, $s); - if ($bytes === false) - { + if ($bytes === false) { if ($is_echo) echo ' Error to writing' . PHP_EOL; return false; } } - if ($is_echo) echo ' ' . $charset . ' -> UTF-8' . PHP_EOL; + if ($is_echo) echo ' ' . $charset . ' to UTF-8 converted' . PHP_EOL; $counter++; - } - elseif ($is_recursive && is_dir($file)) - { - if (! is_string($dirs_ignore_re) || ! preg_match($dirs_ignore_re, $name)) - { + } elseif ($is_recursive && is_dir($file)) { + if (!is_string($dirs_ignore_re) || !preg_match($dirs_ignore_re, $name)) { $c = self::convert_files_from($file, $files_re, $is_recursive, $charset, $dirs_ignore_re, $is_echo, $is_simulate); if ($c === false) return false; $counter += $c; @@ -3959,47 +4066,47 @@ class UTF8 /** * - * @param int|string $low - * @param int|string $high - * @param int $step + * @param int|string $low + * @param int|string $high + * @param int $step * @return array|bool Returns FALSE if error occurred */ public static function range($low, $high, $step = 1) { - if (! ReflectionTypeHint::isValid()) return false; + if (!ReflectionTypeHint::isValid()) return false; if (is_int($low) || is_int($high)) return range($low, $high, $step); #speed improve - $low_cp = self::ord($low); + $low_cp = self::ord($low); $high_cp = self::ord($high); - if ($low_cp === false || $high_cp === false) return false; + if (!is_int($low_cp) || !is_int($high_cp)) return false; $a = range($low_cp, $high_cp, $step); return array_map(array('self', 'chr'), $a); } /** * - * @param string|null $s - * @param string|array $from - * @param string|null $to + * @param string|null $s + * @param string|array $from + * @param string|null $to * @return string|bool|null Returns FALSE if error occurred */ public static function strtr($s, $from, $to = null) { - if (! ReflectionTypeHint::isValid()) return false; - if (is_null($s)) return $s; + if (!ReflectionTypeHint::isValid()) return false; + if (!is_string($s) || $s === '') return $s; if (is_array($from)) return strtr($s, $from); #speed improve - $keys = self::str_split($from); + $keys = self::str_split($from); $values = self::str_split($to); - if ($keys === false || $values === false) return false; + if (!is_array($keys) || !is_array($values)) return false; $table = array_combine($keys, $values); - if (! is_array($table)) return false; + if (!is_array($table)) return false; return strtr($s, $table); } public static function tests() { - assert_options(ASSERT_ACTIVE, true); - assert_options(ASSERT_BAIL, true); - assert_options(ASSERT_WARNING, true); + assert_options(ASSERT_ACTIVE, true); + assert_options(ASSERT_BAIL, true); + assert_options(ASSERT_WARNING, true); assert_options(ASSERT_QUIET_EVAL, false); $a = array( 'self::html_entity_decode(""&<>", true) === "\"&<>"', @@ -4043,6 +4150,9 @@ class UTF8 'self::is_utf8(file_get_contents(' . var_export(__FILE__, true) . ', true)) === true', 'self::is_utf8(file_get_contents(' . var_export(__FILE__, true) . ', false)) === true', 'self::is_ascii(file_get_contents(' . var_export(__FILE__, true) . ')) === false', + 'self::is_ascii("_\x01\x02абв", $error_char_offset) === false && $error_char_offset === 3', + 'self::has_binary(file_get_contents(' . var_export(__FILE__, true) . ')) === false', + 'self::has_binary("_аб\x01вг", $found_char_offset) === true && $found_char_offset === 3', #range() uses ord() and chr() 'self::range("A", "D") === array("A", "B", "C", "D")', @@ -4058,10 +4168,22 @@ class UTF8 'self::preg_quote_case_insensitive("123_слово_test") === "123_(с|С)(л|Л)(о|О)(в|В)(о|О)_[tT][eE][sS][tT]"', 'self::preg_quote_case_insensitive("123_test") === "(?i:123_test)"', + 'self::preg_quote_case_insensitive("123") === "123"', + + 'self::unescape("%D1%82%D0%B5%D1%81%D1%82") === "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"', + 'self::unescape("0xD182D0B5D181D182", true) === "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"', + 'self::unescape("%u0442%u0435%u0441%u0442") === "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"', + 'self::unescape("%u{442}%u{435}%u{0441}%u{00442}") === "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"', + 'self::unescape("%u0025%u0032%u0035+%25%75%30%30%32%35") === "%25 %u0025"', + + 'self::ucfirst("!@#$", true) === "!@#$"', + 'self::ucfirst("!@#$ test", true) === "!@#$ test"', + 'self::ucfirst("«северный Поток»", true) === "«Северный поток»"', + 'self::ucfirst("«северный Поток»", false) === "«Северный Поток»"', //'self::strlen(file_get_contents(' . var_export(__FILE__, true) . ', true))' ); - foreach ($a as $k => $v) if (! assert($v)) return false; + foreach ($a as $k => $v) if (!assert($v)) return false; //$start_time = microtime(true); //$s = file_get_contents(__FILE__); @@ -4072,4 +4194,4 @@ class UTF8 return true; } -} \ No newline at end of file +}