StringHelper.php 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. <?php
  2. /**
  3. * 多功能字符串工具
  4. *
  5. * @author hightman <hightman@twomice.net>
  6. * @link http://www.hightman.cn/
  7. * @copyright Copyright &copy; 2008-2013 Twomice Studio
  8. */
  9. /**
  10. * String Helper (all are static function)
  11. *
  12. * <pre>
  13. * StringHelper::decodeHtml($html);
  14. * StringHelper::fixHtmlCharset($html, $charset = 'utf-8');
  15. * StringHelper::finds($buf, $tag1, $tag2[, ...]);
  16. * StringHelper::find($buf, $tag1, $tag2[, ...]);
  17. * StringHelper::contains($buf, $tokens);
  18. * </pre>
  19. */
  20. class StringHelper
  21. {
  22. /**
  23. * @param string $html
  24. * @return string 解码后的 html
  25. */
  26. public static function decodeHtml($html)
  27. {
  28. if (strpos($html, '<') !== false) {
  29. $html = strip_tags($html); /* preg_replace('/<.+?>/u', '', $html); */
  30. }
  31. return html_entity_decode(trim($html), ENT_QUOTES, 'utf-8');
  32. }
  33. /**
  34. * @param string $charset 目标字符集,默认 utf-8
  35. * @return string 强制转换网页内容为目标字符集
  36. */
  37. public static function fixHtmlCharset($html, $charset = 'utf-8')
  38. {
  39. if (preg_match('/charset=["\']?([0-9a-zA-Z_-]+)/', $html, $match)
  40. && (strncasecmp($charset, 'gb', 2) || strncasecmp($match[1], 'gb', 2))
  41. && strcasecmp($charset, $match[1])) {
  42. if (!strcasecmp($match[1], 'gb2312')) {
  43. $match[1] = 'gbk';
  44. }
  45. if (function_exists('iconv')) {
  46. return iconv($match[1], $charset . '//IGNORE', $html);
  47. } elseif (function_exists('mb_convert_encoding')) {
  48. return mb_convert_encoding($html, $charset, $match[1]);
  49. }
  50. }
  51. return $html;
  52. }
  53. /**
  54. * 根据标记快速查找字符串列表
  55. * @param string $buf
  56. * @param array $config
  57. * array(
  58. * array(key1, arg1, arg2, ...),
  59. * array(key2, arg1, arg2, ...),
  60. * ),
  61. * @return array
  62. * @see StringMatcher::find
  63. */
  64. public static function finds($buf, $config, &$error = null)
  65. {
  66. $obj = new StringMatcher($buf);
  67. return $obj->finds($config, $error);
  68. }
  69. /**
  70. * 根据标记快速查找字符串
  71. * @param string $buf
  72. * @return string 返回最后两个标记之间的内容,找不到返回 null
  73. * @see StringMatcher::find
  74. */
  75. public static function find($buf)
  76. {
  77. $args = func_get_args();
  78. array_shift($args);
  79. $obj = new StringMatcher($buf);
  80. return call_user_func_array(array($obj, 'find'), $args);
  81. }
  82. /**
  83. * 判断字符串是否包含数组中的字符串
  84. * @param string $buf 源字符串
  85. * @param array $tokens 字符串标记列表
  86. * @return boolean
  87. */
  88. public static function contains($buf, $tokens)
  89. {
  90. foreach ($tokens as $token) {
  91. if (strpos($buf, $token) !== false) {
  92. return true;
  93. }
  94. }
  95. return false;
  96. }
  97. }
  98. /**
  99. * StringMatcher to parse data
  100. */
  101. class StringMatcher
  102. {
  103. private $_buf, $_pos;
  104. /**
  105. * @param string $buf
  106. */
  107. public function __construct($buf)
  108. {
  109. $this->_buf = $buf;
  110. $this->_pos = 0;
  111. }
  112. /**
  113. * 批量查找
  114. * @param array $config
  115. * array(
  116. * array(key1, arg1, arg2, ...),
  117. * array(key2, arg1, arg2, ...),
  118. * ),
  119. * @param string $error optional reference
  120. * @return array
  121. */
  122. public function finds($config, &$error = null)
  123. {
  124. $ret = array();
  125. foreach ($config as $args) {
  126. $key = array_shift($args);
  127. $val = call_user_func_array(array($this, 'find'), $args);
  128. if ($val === null || $val === false) {
  129. $error = 'Cannot find `' . $key . '\': ' . implode(' ... ', $args);
  130. $pos = strrpos($error, '...');
  131. $error = substr_replace($error, '???', $pos, 3);
  132. continue;
  133. //return false;
  134. }
  135. $ret[$key] = $val;
  136. }
  137. return $ret;
  138. }
  139. /**
  140. * 根据特征查找字符串,不定参数:
  141. * 起始1,起始2,起始3 ... 结束关键
  142. * 新增支持特殊串
  143. * "$$$...",表示后面的字符串必须在这个字符串之前,以免跨越太大
  144. * "^^^...",表示后面的字符串如果在这个串之前就用采用当前串的位置
  145. * @return string 成功返回区间内的字符串并将位置设在本字符串之末,若找不到返回 null
  146. */
  147. public function find()
  148. {
  149. $args = func_get_args();
  150. $cnt = count($args);
  151. if ($cnt < 2) {
  152. return trigger_error(__CLASS__ . '::find() expects at least 2 parameters, ' . $cnt . ' given', E_USER_WARNING);
  153. }
  154. for ($end = $pre = false, $pos1 = $this->_pos, $i = 0; $i < ($cnt - 1); $i++) {
  155. if (substr($args[$i], 0, 3) === '$$$') {
  156. $end = strpos($this->_buf, substr($args[$i], 3), $pos1);
  157. } elseif (substr($args[$i], 0, 3) === '^^^') {
  158. $pre = strpos($this->_buf, substr($args[$i], 3), $pos1);
  159. } else {
  160. $pos1 = strpos($this->_buf, $args[$i], $pos1);
  161. if ($pos1 === false) {
  162. return null;
  163. } elseif ($end !== false && $pos1 > $end) {
  164. return '';
  165. }
  166. if ($pre !== false) {
  167. if ($pos1 > $pre) {
  168. $pos1 = $pre;
  169. }
  170. $pre = false;
  171. }
  172. $pos1 += strlen($args[$i]);
  173. }
  174. }
  175. if (($pos2 = strpos($this->_buf, $args[$i], $pos1)) !== false) {
  176. if ($end !== false && $pos2 > $end) {
  177. return '';
  178. }
  179. if ($pre !== false) {
  180. if ($pos2 > $pre) {
  181. $pos2 = $pre;
  182. }
  183. $pre = false;
  184. }
  185. $this->_pos = $pos2;
  186. return substr($this->_buf, $pos1, $pos2 - $pos1);
  187. }
  188. return null;
  189. }
  190. /**
  191. * 移动当前处理位置位置指针,类似 fseek
  192. * @param int $offset
  193. * @param int $whence 可选值:SEEK_SET/SEEK_CUR/SEEK_END
  194. */
  195. public function seek($offset, $whence = SEEK_CUR)
  196. {
  197. $offset = intval($offset);
  198. switch ($whence) {
  199. case SEEK_SET:
  200. $this->_pos = $offset;
  201. break;
  202. case SEEK_END:
  203. $this->_pos = $offset + strlen($this->_buf);
  204. break;
  205. case SEEK_CUR:
  206. default:
  207. $this->_pos += $offset;
  208. break;
  209. }
  210. return $this->_pos;
  211. }
  212. }