Geen omschrijving
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

Pinyin.php 9.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343
  1. <?php
  2. /*
  3. * This file is part of the overtrue/pinyin.
  4. *
  5. * (c) overtrue <i@overtrue.me>
  6. *
  7. * This source file is subject to the MIT license that is bundled
  8. * with this source code in the file LICENSE.
  9. */
  10. namespace Overtrue\Pinyin;
  11. use InvalidArgumentException;
  /**
   * Converts Chinese text to pinyin.
   *
   * The conversion dictionaries are obtained from a dict loader; output shape
   * (tone marks, kept characters, surname handling) is controlled by the
   * PINYIN_* bit flags passed as `$option`.
   */
  class Pinyin
  {
      /**
       * Dict loader instance, or the class name of a loader that
       * getLoader() instantiates on first use.
       *
       * @var \Overtrue\Pinyin\DictLoaderInterface|string
       */
      protected $loader;

      /**
       * Punctuation map: Chinese punctuation => ASCII equivalent.
       *
       * The full-width keys are written as UTF-8 byte escapes on purpose, so
       * that compatibility normalization of this source file cannot fold them
       * into their ASCII look-alikes (which would turn the entries into no-ops).
       * The ASCII forms remain accepted because prepare() whitelists the values
       * of this map as well as its keys.
       *
       * @var array
       */
      protected $punctuations = [
          "\xEF\xBC\x8C" => ',', // U+FF0C FULLWIDTH COMMA
          '。' => '.',
          "\xEF\xBC\x81" => '!', // U+FF01 FULLWIDTH EXCLAMATION MARK
          "\xEF\xBC\x9F" => '?', // U+FF1F FULLWIDTH QUESTION MARK
          "\xEF\xBC\x9A" => ':', // U+FF1A FULLWIDTH COLON
          '“' => '"',
          '”' => '"',
          '‘' => "'",
          '’' => "'",
          '_' => '_',
      ];

      /**
       * Constructor.
       *
       * @param string|null $loaderName Loader class name; falls back to the
       *                                file dict loader when empty
       */
      public function __construct($loaderName = null)
      {
          $this->loader = $loaderName ?: 'Overtrue\\Pinyin\\FileDictLoader';
      }

      /**
       * Convert string to pinyin.
       *
       * @param string $string
       * @param int    $option
       *
       * @return array list of pinyin words
       */
      public function convert($string, $option = PINYIN_DEFAULT)
      {
          $pinyin = $this->romanize($string, $option);

          return $this->splitWords($pinyin, $option);
      }

      /**
       * Convert string (person name) to pinyin.
       *
       * Same as convert(), with the PINYIN_NAME flag always switched on so the
       * surname dictionary is consulted first.
       *
       * @param string $stringName
       * @param int    $option
       *
       * @return array list of pinyin words
       */
      public function name($stringName, $option = PINYIN_NAME)
      {
          return $this->convert($stringName, $option | PINYIN_NAME);
      }

      /**
       * Return a pinyin permalink from string.
       *
       * Numbers and English letters are always kept. An integer passed as
       * `$delimiter` is treated as the option and the delimiter defaults to '-'.
       *
       * @param string     $string
       * @param string|int $delimiter one of '_', '-', '.', ''
       * @param int        $option
       *
       * @return string
       *
       * @throws InvalidArgumentException when the delimiter is not allowed
       */
      public function permalink($string, $delimiter = '-', $option = PINYIN_DEFAULT)
      {
          if (\is_int($delimiter)) {
              $option = $delimiter;
              $delimiter = '-';
          }

          if (!in_array($delimiter, ['_', '-', '.', ''], true)) {
              throw new InvalidArgumentException("Delimiter must be one of: '_', '-', '', '.'.");
          }

          $words = $this->convert($string, $option | \PINYIN_KEEP_NUMBER | \PINYIN_KEEP_ENGLISH);

          return implode($delimiter, $words);
      }

      /**
       * Return first letters.
       *
       * Words that contain a digit are kept whole; every other word is reduced
       * to its first character. Tones are always stripped. An integer passed as
       * `$delimiter` is treated as the option and the delimiter defaults to ''.
       *
       * @param string     $string
       * @param string|int $delimiter
       * @param int        $option
       *
       * @return string
       */
      public function abbr($string, $delimiter = '', $option = PINYIN_DEFAULT)
      {
          if (\is_int($delimiter)) {
              $option = $delimiter;
              $delimiter = '';
          }

          $initials = array_map(function ($word) {
              // Any numeric string necessarily contains a digit, so one digit
              // test covers both "is a number" and "contains a number".
              if (preg_match('/\d/', $word)) {
                  return $word;
              }

              return mb_substr($word, 0, 1, 'UTF-8');
          }, $this->convert($string, $option | PINYIN_NO_TONE));

          return implode($delimiter, $initials);
      }

      /**
       * Chinese phrase to pinyin.
       *
       * An integer passed as `$delimiter` is treated as the option and the
       * delimiter defaults to a single space.
       *
       * @param string     $string
       * @param string|int $delimiter
       * @param int        $option
       *
       * @return string
       */
      public function phrase($string, $delimiter = ' ', $option = PINYIN_DEFAULT)
      {
          if (\is_int($delimiter)) {
              $option = $delimiter;
              $delimiter = ' ';
          }

          return implode($delimiter, $this->convert($string, $option));
      }

      /**
       * Chinese to pinyin sentence.
       *
       * Punctuation, English letters and numbers are always kept. An integer
       * passed as `$delimiter` is treated as the option and the delimiter
       * defaults to a single space.
       *
       * @param string     $string
       * @param string|int $delimiter
       * @param int        $option
       *
       * @return string
       */
      public function sentence($string, $delimiter = ' ', $option = \PINYIN_NO_TONE)
      {
          if (\is_int($delimiter)) {
              $option = $delimiter;
              $delimiter = ' ';
          }

          $keep = \PINYIN_KEEP_PUNCTUATION | \PINYIN_KEEP_ENGLISH | \PINYIN_KEEP_NUMBER;

          return implode($delimiter, $this->convert($string, $option | $keep));
      }

      /**
       * Loader setter.
       *
       * @param \Overtrue\Pinyin\DictLoaderInterface $loader
       *
       * @return $this
       */
      public function setLoader(DictLoaderInterface $loader)
      {
          $this->loader = $loader;

          return $this;
      }

      /**
       * Return dict loader.
       *
       * When only a class name has been stored so far, the loader is created
       * here, receiving the `data/` directory next to this source directory.
       *
       * @return \Overtrue\Pinyin\DictLoaderInterface
       */
      public function getLoader()
      {
          if (!($this->loader instanceof DictLoaderInterface)) {
              $dataDir = dirname(__DIR__) . '/data/';
              $loaderName = $this->loader;
              $this->loader = new $loaderName($dataDir);
          }

          return $this->loader;
      }

      /**
       * Convert Chinese to pinyin.
       *
       * Pre-processes the input, optionally converts a leading surname, then
       * applies every dictionary handed over by the loader via strtr().
       *
       * @param string $string
       * @param int    $option
       *
       * @return string
       */
      protected function romanize($string, $option = \PINYIN_DEFAULT)
      {
          $string = $this->prepare($string, $option);
          $dictLoader = $this->getLoader();

          if ($this->hasOption($option, \PINYIN_NAME)) {
              $string = $this->convertSurname($string, $dictLoader);
          }

          $dictLoader->map(function ($dictionary) use (&$string) {
              $string = strtr($string, $dictionary);
          });

          return $string;
      }

      /**
       * Convert Chinese Surname to pinyin.
       *
       * Only a surname at the very start of the string is replaced; the
       * remainder of the string is left untouched.
       *
       * @param string                               $string
       * @param \Overtrue\Pinyin\DictLoaderInterface $dictLoader
       *
       * @return string
       */
      protected function convertSurname($string, $dictLoader)
      {
          $dictLoader->mapSurname(function ($dictionary) use (&$string) {
              foreach ($dictionary as $surname => $pinyin) {
                  if (0 !== strpos($string, $surname)) {
                      continue;
                  }

                  // A null length means "up to the end of the string".
                  $rest = mb_substr($string, mb_strlen($surname, 'UTF-8'), null, 'UTF-8');
                  $string = $pinyin . $rest;

                  break;
              }
          });

          return $string;
      }

      /**
       * Split pinyin string to words.
       *
       * Unless PINYIN_TONE is requested, every word is additionally passed
       * through formatTone().
       *
       * @param string $pinyin
       * @param int    $option
       *
       * @return array
       */
      protected function splitWords($pinyin, $option)
      {
          // PREG_SPLIT_NO_EMPTY drops only empty pieces. A bare array_filter()
          // would also discard the token "0", silently losing a kept number.
          $words = preg_split('/\s+/', $pinyin, -1, PREG_SPLIT_NO_EMPTY) ?: [];

          if (!$this->hasOption($option, PINYIN_TONE)) {
              foreach ($words as $index => $word) {
                  $words[$index] = $this->formatTone($word, $option);
              }
          }

          return $words;
      }

      /**
       * Tell whether every bit of `$check` is set in `$option`.
       *
       * @param int $option
       * @param int $check
       *
       * @return bool
       */
      public function hasOption($option, $check)
      {
          return ($option & $check) === $check;
      }

      /**
       * Pre-process.
       *
       * Prefixes each run of ASCII letters/digits/`_`/`-` with a tab, then
       * removes every character that is not whitelisted. The whitelist always
       * contains the listed CJK ranges, separators, marks and tab; numbers,
       * English letters and punctuation join it according to `$option`.
       *
       * @param string $string
       * @param int    $option
       *
       * @return string
       */
      protected function prepare($string, $option = \PINYIN_DEFAULT)
      {
          // The tab keeps alphanumeric runs apart from neighbouring text.
          $string = preg_replace_callback('~[a-z0-9_-]+~i', function ($matches) {
              return "\t" . $matches[0];
          }, $string);

          $allowed = ['\x{3007}\x{2E80}-\x{2FFF}\x{3100}-\x{312F}\x{31A0}-\x{31EF}\x{3400}-\x{4DBF}\x{4E00}-\x{9FFF}\x{F900}-\x{FAFF}', '\p{Z}', '\p{M}', "\t"];

          if ($this->hasOption($option, \PINYIN_KEEP_NUMBER)) {
              $allowed[] = '0-9';
          }

          if ($this->hasOption($option, \PINYIN_KEEP_ENGLISH)) {
              $allowed[] = 'a-zA-Z';
          }

          if ($this->hasOption($option, \PINYIN_KEEP_PUNCTUATION)) {
              // Translate punctuation first (tabs become plain spaces here),
              // then whitelist both the source and the target characters.
              $map = array_merge($this->punctuations, ["\t" => ' ', ' ' => ' ']);
              $string = trim(str_replace(array_keys($map), array_values($map), $string));

              $kept = array_merge(array_keys($this->punctuations), array_values($this->punctuations));
              $allowed[] = preg_quote(implode($kept), '~');
          }

          return preg_replace(\sprintf('~[^%s]~u', implode($allowed)), '', $string);
      }

      /**
       * Format.
       *
       * Replaces tone-marked vowels by plain letters. With PINYIN_UMLAUT_V the
       * `yu` spelling of `ü` becomes `v`; with PINYIN_ASCII_TONE the tone
       * number is appended to the word. The neutral-tone marker `5` is appended
       * only when the word carries no tone 1-4 mark, so a toned syllable that
       * contains `ü` gets exactly one tone digit.
       *
       * @param string $pinyin
       * @param int    $option
       *
       * @return string
       */
      protected function formatTone($pinyin, $option = \PINYIN_NO_TONE)
      {
          $replacements = [
              // mb_chr(593) => 'ɑ'. In the neutral tone only `ɑ` and `ü` differ
              // from plain letters; all other vowels are already plain letters.
              'ɑ' => ['a', 5], 'ü' => ['yu', 5],
              // NOTE(review): these four rows look unreachable, because the bare
              // 'ü' above is replaced first; kept so the table stays as it was.
              'üē' => ['ue', 1], 'üé' => ['ue', 2], 'üě' => ['ue', 3], 'üè' => ['ue', 4],
              'ā' => ['a', 1], 'ē' => ['e', 1], 'ī' => ['i', 1], 'ō' => ['o', 1], 'ū' => ['u', 1], 'ǖ' => ['yu', 1],
              'á' => ['a', 2], 'é' => ['e', 2], 'í' => ['i', 2], 'ó' => ['o', 2], 'ú' => ['u', 2], 'ǘ' => ['yu', 2],
              'ǎ' => ['a', 3], 'ě' => ['e', 3], 'ǐ' => ['i', 3], 'ǒ' => ['o', 3], 'ǔ' => ['u', 3], 'ǚ' => ['yu', 3],
              'à' => ['a', 4], 'è' => ['e', 4], 'ì' => ['i', 4], 'ò' => ['o', 4], 'ù' => ['u', 4], 'ǜ' => ['yu', 4],
          ];

          $umlautAsV = $this->hasOption($option, \PINYIN_UMLAUT_V);
          $asciiTone = $this->hasOption($option, PINYIN_ASCII_TONE);
          $sawNeutral = false;
          $sawTone = false;

          foreach ($replacements as $unicode => $replacement) {
              if (false === strpos($pinyin, $unicode)) {
                  continue;
              }

              list($letters, $tone) = $replacement;

              // https://zh.wikipedia.org/wiki/%C3%9C
              if ($umlautAsV && 'yu' === $letters) {
                  $letters = 'v';
              }

              $pinyin = str_replace($unicode, $letters, $pinyin);

              if (!$asciiTone) {
                  continue;
              }

              if (5 === $tone) {
                  // Decide at the end: a real tone mark takes precedence.
                  $sawNeutral = true;
                  continue;
              }

              $sawTone = true;
              $pinyin .= $tone;
          }

          if ($sawNeutral && !$sawTone) {
              $pinyin .= '5';
          }

          return $pinyin;
      }
  }