0byt3m1n1
Path:
/
data
/
applications
/
aps
/
mantis
/
1.2.7-0
/
standard
/
htdocs
/
library
/
utf8
/
exp
/
[
Home
]
File: regexunicode.php
<?php
/**
 * Experiment: a PCRE-based UTF-8 to Unicode code point converter, written to
 * see how it would perform versus a character-by-character converter (as in
 * '../utf8_unicode.php').
 *
 * NOTE(review): the original header comment is missing a word at this point
 * ("Basically this is very ___ by comparison"); presumably "slow" - confirm.
 * The code was kept by its author as "perhaps interesting anyway".
 */

/*
 * Pattern body that matches exactly one UTF-8 encoded character, or failing
 * that, one single byte. It is wrapped in delimiters where it is used.
 *
 * Capture groups (toCodePoint() relies on this numbering):
 *   1 - one ASCII byte: tab, LF, CR or 0x20-0x7E
 *   2 - a well-formed 2-byte sequence
 *   3 - a well-formed 3-byte or 4-byte sequence
 *   4 - any other single byte, i.e. one that is not part of a well-formed
 *       sequence (reported as invalid by toCodePoint())
 *
 * Every fragment must be joined with '.'; a ';' in the middle silently drops
 * the remaining alternatives from the pattern.
 */
$UTF8_MATCH =
    '([\x09\x0A\x0D\x20-\x7E])'.            // ASCII (excluding control chars)
    '|([\xC2-\xDF][\x80-\xBF])'.            // non-overlong 2-byte
    '|(\xE0[\xA0-\xBF][\x80-\xBF]'.         // 3-byte, excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.   // straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.          // 3-byte, excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.       // 4-byte, planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.           // 4-byte, planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2})'.      // 4-byte, plane 16
    '|(.{1})';                              // catch bad bytes

/**
 * preg_replace_callback() handler that decodes one match of $UTF8_MATCH and
 * appends the resulting code point to the global $points array.
 *
 * Multi-byte sequences are decoded according to their byte length rather than
 * the group they arrived in, so a 3-byte sequence is decoded correctly whether
 * it was captured by group 2 or group 3.
 *
 * @param array $matches Match array: index 0 is the full match, indexes 1-4
 *                       are the capture groups described on $UTF8_MATCH.
 *                       Groups may be empty strings or absent.
 * @return string The matched text unchanged, or '' for an invalid byte (which
 *                also raises an E_USER_WARNING and adds nothing to $points).
 */
function toCodePoint($matches) {

    global $points;

    // Group 1: plain ASCII, the byte value is the code point.
    if ( isset($matches[1]) && $matches[1] !== '' ) {
        $points[] = ord($matches[1]);
        return $matches[0];
    }

    // Groups 2 and 3: pick whichever multi-byte sequence was captured.
    $sequence = '';
    if ( isset($matches[2]) && $matches[2] !== '' ) {
        $sequence = $matches[2];
    } else if ( isset($matches[3]) && $matches[3] !== '' ) {
        $sequence = $matches[3];
    }

    // Strip the marker bits of each byte (modulo) and shift the payload into
    // place (multiplication by a power of 64).
    switch ( strlen($sequence) ) {

        case 2:
            $points[] = ( ( ord($sequence[0]) % 32 ) * 64 )
                + ( ord($sequence[1]) % 64 );
            return $matches[0];

        case 3:
            $points[] = ( ( ord($sequence[0]) % 16 ) * 4096 )
                + ( ( ord($sequence[1]) % 64 ) * 64 )
                + ( ord($sequence[2]) % 64 );
            return $matches[0];

        case 4:
            $points[] = ( ( ord($sequence[0]) % 8 ) * 262144 )
                + ( ( ord($sequence[1]) % 64 ) * 4096 )
                + ( ( ord($sequence[2]) % 64 ) * 64 )
                + ( ord($sequence[3]) % 64 );
            return $matches[0];
    }

    // Group 4: a byte that is not part of any well-formed sequence. Report it
    // and remove it from the replaced string.
    if ( isset($matches[4]) && $matches[4] !== '' ) {
        trigger_error('Invalid byte in UTF-8',E_USER_WARNING);
        return '';
    }

    return $matches[0];
}

$str = file_get_contents('../tests/data/utf8.html');
if ( $str === false ) {
    // The read failed (file_get_contents() reports that itself); carry on
    // with an empty subject so the script still prints an empty result.
    $str = '';
}

$points = array();

// Only the callback's side effect on $points matters; the replaced string
// is discarded.
preg_replace_callback('/'.$UTF8_MATCH.'/S','toCodePoint',$str);

print_r($points);