123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343 |
- <?php
-
- /*
- * This file is part of the overtrue/pinyin.
- *
- * (c) overtrue <i@overtrue.me>
- *
- * This source file is subject to the MIT license that is bundled
- * with this source code in the file LICENSE.
- */
-
- namespace Overtrue\Pinyin;
-
- use InvalidArgumentException;
-
- class Pinyin
- {
- /**
- * Dict loader.
- *
- * @var \Overtrue\Pinyin\DictLoaderInterface
- */
- protected $loader;
-
- /**
- * Punctuations map.
- *
- * @var array
- */
- protected $punctuations = [
- ',' => ',',
- '。' => '.',
- '!' => '!',
- '?' => '?',
- ':' => ':',
- '“' => '"',
- '”' => '"',
- '‘' => "'",
- '’' => "'",
- '_' => '_',
- ];
-
- /**
- * Constructor.
- *
- * @param string $loaderName
- */
- public function __construct($loaderName = null)
- {
- $this->loader = $loaderName ?: 'Overtrue\\Pinyin\\FileDictLoader';
- }
-
- /**
- * Convert string to pinyin.
- *
- * @param string $string
- * @param int $option
- *
- * @return array
- */
- public function convert($string, $option = PINYIN_DEFAULT)
- {
- $pinyin = $this->romanize($string, $option);
-
- return $this->splitWords($pinyin, $option);
- }
-
- /**
- * Convert string (person name) to pinyin.
- *
- * @param string $stringName
- * @param int $option
- *
- * @return array
- */
- public function name($stringName, $option = PINYIN_NAME)
- {
- $option = $option | PINYIN_NAME;
-
- $pinyin = $this->romanize($stringName, $option);
-
- return $this->splitWords($pinyin, $option);
- }
-
- /**
- * Return a pinyin permalink from string.
- *
- * @param string $string
- * @param string $delimiter
- * @param int $option
- *
- * @return string
- */
- public function permalink($string, $delimiter = '-', $option = PINYIN_DEFAULT)
- {
- if (\is_int($delimiter)) {
- list($option, $delimiter) = [$delimiter, '-'];
- }
-
- if (!in_array($delimiter, ['_', '-', '.', ''], true)) {
- throw new InvalidArgumentException("Delimiter must be one of: '_', '-', '', '.'.");
- }
-
- return implode($delimiter, $this->convert($string, $option | \PINYIN_KEEP_NUMBER | \PINYIN_KEEP_ENGLISH));
- }
-
- /**
- * Return first letters.
- *
- * @param string $string
- * @param string $delimiter
- * @param int $option
- *
- * @return string
- */
- public function abbr($string, $delimiter = '', $option = PINYIN_DEFAULT)
- {
- if (\is_int($delimiter)) {
- list($option, $delimiter) = [$delimiter, ''];
- }
-
- return implode($delimiter, array_map(function ($pinyin) {
- return \is_numeric($pinyin) || preg_match('/\d+/', $pinyin) ? $pinyin : mb_substr($pinyin, 0, 1);
- }, $this->convert($string, $option | PINYIN_NO_TONE)));
- }
-
- /**
- * Chinese phrase to pinyin.
- *
- * @param string $string
- * @param string $delimiter
- * @param int $option
- *
- * @return string
- */
- public function phrase($string, $delimiter = ' ', $option = PINYIN_DEFAULT)
- {
- if (\is_int($delimiter)) {
- list($option, $delimiter) = [$delimiter, ' '];
- }
-
- return implode($delimiter, $this->convert($string, $option));
- }
-
- /**
- * Chinese to pinyin sentence.
- *
- * @param string $string
- * @param string $delimiter
- * @param int $option
- *
- * @return string
- */
- public function sentence($string, $delimiter = ' ', $option = \PINYIN_NO_TONE)
- {
- if (\is_int($delimiter)) {
- list($option, $delimiter) = [$delimiter, ' '];
- }
-
- return implode($delimiter, $this->convert($string, $option | \PINYIN_KEEP_PUNCTUATION | \PINYIN_KEEP_ENGLISH | \PINYIN_KEEP_NUMBER));
- }
-
- /**
- * Loader setter.
- *
- * @param \Overtrue\Pinyin\DictLoaderInterface $loader
- *
- * @return $this
- */
- public function setLoader(DictLoaderInterface $loader)
- {
- $this->loader = $loader;
-
- return $this;
- }
-
- /**
- * Return dict loader,.
- *
- * @return \Overtrue\Pinyin\DictLoaderInterface
- */
- public function getLoader()
- {
- if (!($this->loader instanceof DictLoaderInterface)) {
- $dataDir = dirname(__DIR__) . '/data/';
-
- $loaderName = $this->loader;
- $this->loader = new $loaderName($dataDir);
- }
-
- return $this->loader;
- }
-
- /**
- * Convert Chinese to pinyin.
- *
- * @param string $string
- * @param int $option
- *
- * @return string
- */
- protected function romanize($string, $option = \PINYIN_DEFAULT)
- {
- $string = $this->prepare($string, $option);
-
- $dictLoader = $this->getLoader();
-
- if ($this->hasOption($option, \PINYIN_NAME)) {
- $string = $this->convertSurname($string, $dictLoader);
- }
-
- $dictLoader->map(function ($dictionary) use (&$string) {
- $string = strtr($string, $dictionary);
- });
-
- return $string;
- }
-
- /**
- * Convert Chinese Surname to pinyin.
- *
- * @param string $string
- * @param \Overtrue\Pinyin\DictLoaderInterface $dictLoader
- *
- * @return string
- */
- protected function convertSurname($string, $dictLoader)
- {
- $dictLoader->mapSurname(function ($dictionary) use (&$string) {
- foreach ($dictionary as $surname => $pinyin) {
- if (0 === strpos($string, $surname)) {
- $string = $pinyin . mb_substr($string, mb_strlen($surname, 'UTF-8'), mb_strlen($string, 'UTF-8') - 1, 'UTF-8');
-
- break;
- }
- }
- });
-
- return $string;
- }
-
- /**
- * Split pinyin string to words.
- *
- * @param string $pinyin
- * @param string $option
- *
- * @return array
- */
- protected function splitWords($pinyin, $option)
- {
- $split = array_filter(preg_split('/\s+/i', $pinyin));
-
- if (!$this->hasOption($option, PINYIN_TONE)) {
- foreach ($split as $index => $pinyin) {
- $split[$index] = $this->formatTone($pinyin, $option);
- }
- }
-
- return array_values($split);
- }
-
- /**
- * @param int $option
- * @param int $check
- *
- * @return bool
- */
- public function hasOption($option, $check)
- {
- return ($option & $check) === $check;
- }
-
- /**
- * Pre-process.
- *
- * @param string $string
- * @param int $option
- *
- * @return string
- */
- protected function prepare($string, $option = \PINYIN_DEFAULT)
- {
- $string = preg_replace_callback('~[a-z0-9_-]+~i', function ($matches) {
- return "\t" . $matches[0];
- }, $string);
-
- $regex = ['\x{3007}\x{2E80}-\x{2FFF}\x{3100}-\x{312F}\x{31A0}-\x{31EF}\x{3400}-\x{4DBF}\x{4E00}-\x{9FFF}\x{F900}-\x{FAFF}', '\p{Z}', '\p{M}', "\t"];
-
- if ($this->hasOption($option, \PINYIN_KEEP_NUMBER)) {
- \array_push($regex, '0-9');
- }
-
- if ($this->hasOption($option, \PINYIN_KEEP_ENGLISH)) {
- \array_push($regex, 'a-zA-Z');
- }
-
- if ($this->hasOption($option, \PINYIN_KEEP_PUNCTUATION)) {
- $punctuations = array_merge($this->punctuations, ["\t" => ' ', ' ' => ' ']);
- $string = trim(str_replace(array_keys($punctuations), $punctuations, $string));
-
- \array_push($regex, preg_quote(implode(array_merge(array_keys($this->punctuations), $this->punctuations)), '~'));
- }
-
- return preg_replace(\sprintf('~[^%s]~u', implode($regex)), '', $string);
- }
-
- /**
- * Format.
- *
- * @param string $pinyin
- * @param int $option
- *
- * @return string
- */
- protected function formatTone($pinyin, $option = \PINYIN_NO_TONE)
- {
- $replacements = [
- // mb_chr(593) => 'ɑ' 轻声中除了 `ɑ` 和 `ü` 以外,其它和字母一样
- 'ɑ' => ['a', 5], 'ü' => ['yu', 5],
- 'üē' => ['ue', 1], 'üé' => ['ue', 2], 'üě' => ['ue', 3], 'üè' => ['ue', 4],
- 'ā' => ['a', 1], 'ē' => ['e', 1], 'ī' => ['i', 1], 'ō' => ['o', 1], 'ū' => ['u', 1], 'ǖ' => ['yu', 1],
- 'á' => ['a', 2], 'é' => ['e', 2], 'í' => ['i', 2], 'ó' => ['o', 2], 'ú' => ['u', 2], 'ǘ' => ['yu', 2],
- 'ǎ' => ['a', 3], 'ě' => ['e', 3], 'ǐ' => ['i', 3], 'ǒ' => ['o', 3], 'ǔ' => ['u', 3], 'ǚ' => ['yu', 3],
- 'à' => ['a', 4], 'è' => ['e', 4], 'ì' => ['i', 4], 'ò' => ['o', 4], 'ù' => ['u', 4], 'ǜ' => ['yu', 4],
- ];
-
- foreach ($replacements as $unicode => $replacement) {
- if (false !== strpos($pinyin, $unicode)) {
- $umlaut = $replacement[0];
-
- // https://zh.wikipedia.org/wiki/%C3%9C
- if ($this->hasOption($option, \PINYIN_UMLAUT_V) && 'yu' == $umlaut) {
- $umlaut = 'v';
- }
-
- $pinyin = str_replace($unicode, $umlaut, $pinyin) . ($this->hasOption($option, PINYIN_ASCII_TONE) ? $replacement[1] : '');
- }
- }
-
- return $pinyin;
- }
- }
|