From 6f7d64dacb3e51835590d417255c1bf61c0e5f5b Mon Sep 17 00:00:00 2001 From: mozillazg Date: Sat, 6 Apr 2019 20:48:43 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E5=90=88=E5=B9=B6=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E6=97=B6=E4=B8=8D=E5=86=8D=E7=89=B9=E6=AE=8A=E5=AF=B9=E5=BE=85?= =?UTF-8?q?=20kHanyuPinlu.txt=EF=BC=8C=E6=94=B9=E6=88=90=E6=AD=A3=E5=B8=B8?= =?UTF-8?q?=20merge?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- merge_unihan.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/merge_unihan.py b/merge_unihan.py index 678152c..08dbfd3 100644 --- a/merge_unihan.py +++ b/merge_unihan.py @@ -63,6 +63,7 @@ def extend_pinyins(old_map, new_map, only_no_exists=False): else: old_map.setdefault(code, []).extend(pinyins) + if __name__ == '__main__': raw_pinyin_map = {} with open('kHanyuPinyin.txt') as fp: @@ -87,10 +88,8 @@ def extend_pinyins(old_map, new_map, only_no_exists=False): extend_pinyins(raw_pinyin_map, adjust_pinyin_map) with open('kHanyuPinlu.txt') as fp: khanyupinyinlu = parse_pinyins(fp) - # 之所以只增加不存在的拼音数据而不更新已有的数据 - # 是因为 kHanyuPinlu 的拼音数据中存在一部分不需要的轻声拼音 - # 以及部分音调标错了位置,比如把 ``ǒu`` 标成了 ``oǔ`` - extend_pinyins(raw_pinyin_map, khanyupinyinlu, only_no_exists=True) + extend_pinyins(adjust_pinyin_map, _map) + extend_pinyins(raw_pinyin_map, adjust_pinyin_map) with open('GBK_PUA.txt') as fp: pua_pinyin_map = parse_pinyins(fp) extend_pinyins(raw_pinyin_map, pua_pinyin_map) From 6f3e77ed5b382bea1c1c7daf05fbdae6b4e8ee13 Mon Sep 17 00:00:00 2001 From: mozillazg Date: Sat, 1 Jun 2019 17:20:58 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E6=8A=8A=E6=97=A5=E6=9C=AC=E8=87=AA?= =?UTF-8?q?=E9=80=A0=E6=B1=89=E5=AD=97=E7=9A=84=E6=8B=BC=E9=9F=B3=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E7=A7=BB=E5=88=B0=20kanji.txt=20=E4=B8=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 8 +++++++- README.md | 6 +++++- kMandarin_overwrite.txt | 34 ---------------------------------- kanji.txt | 32 ++++++++++++++++++++++++++++++++ merge_unihan.py | 3 +++ overwrite.txt | 2 ++ pinyin.txt | 4 ++-- 7 files changed, 51 insertions(+), 38 deletions(-) create mode 100644 kanji.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index 4443ac1..38b5a29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # ChangeLog +## [0.8.0] (2019-06-01) + +* 增加 `kanji.txt` 日本自造汉字的拼音数据 via [#32]. Thanks [@LuoZijun](https://github.com/LuoZijun) + ## [0.7.0] (2019-03-31) @@ -23,7 +27,7 @@ ## [0.5.1] (2018-04-19) -* 更正 `卓`、`啥` 的拼音数据 via [#26] 。Thanks [shibingli](https://github.com/shibingli) +* 更正 `卓`、`啥` 的拼音数据 via [#26] 。Thanks [@shibingli](https://github.com/shibingli) * 更新 `〇` 的拼音数据 via [#27] @@ -80,6 +84,7 @@ [#27]: https://github.com/mozillazg/pinyin-data/pull/27 [68dc169]: https://github.com/mozillazg/pinyin-data/commit/68dc169c3f0f02cb9bf53290edab2d2d2463e0c5 [8802f31]: https://github.com/mozillazg/pinyin-data/commit/8802f31e0e65c6e34a497adb55993425741a9d41 +[#32]: https://github.com/mozillazg/pinyin-data/pull/32 [0.2.0]: https://github.com/mozillazg/pinyin-data/compare/v0.1.0...v0.2.0 [0.3.0]: https://github.com/mozillazg/pinyin-data/compare/v0.2.0...v0.3.0 @@ -91,3 +96,4 @@ [0.6.1]: https://github.com/mozillazg/pinyin-data/compare/v0.6.0...v0.6.1 [0.6.2]: https://github.com/mozillazg/pinyin-data/compare/v0.6.1...v0.6.2 [0.7.0]: https://github.com/mozillazg/pinyin-data/compare/v0.6.2...v0.7.0 +[0.8.0]: https://github.com/mozillazg/pinyin-data/compare/v0.7.0...v0.8.0 diff --git a/README.md b/README.md index 866c011..864e3e3 100644 --- a/README.md +++ b/README.md @@ -29,14 +29,16 @@ * `kMandarin_overwrite.txt`: 手工纠正 `kMandarin.txt` 中有误的拼音数据(**可以修改**) * `GBK_PUA.txt`: [Private Use Area](https://en.wikipedia.org/wiki/Private_Use_Areas) 中有拼音的汉字,参考 [GB 18030 - 维基百科,自由的百科全书](https://zh.wikipedia.org/wiki/GB_18030#PUA) (**可以修改**) * `nonCJKUI.txt`: 不属于 [CJK Unified Ideograph](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs) 但是却有拼音的字符(**可以修改**) +* `kanji.txt`: [日本自造汉字](https://zh.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC%E6%B1%89%E5%AD%97#7_%E6%97%A5%E6%9C%AC%E6%B1%89%E5%AD%97%E7%9A%84%E6%B1%89%E8%AF%AD%E6%99%AE%E9%80%9A%E8%AF%9D%E8%A7%84%E8%8C%83%E8%AF%BB%E9%9F%B3%E8%A1%A8) 的拼音数据 (**可以修改**) * `kMandarin_8105.txt`: [《通用规范汉字表》](https://zh.wikipedia.org/wiki/通用规范汉字表)(2013 年版)里 8105 个汉字最常用的一个读音 (**可以修改**) * `overwrite.txt`: 手工纠正的拼音数据(**可以修改**) * `pinyin.txt`: 合并上述文件后的拼音数据 -* `zdic.txt`: [汉典网](http://zdic.net) 的拼音数据 +* `zdic.txt`: [汉典网](http://zdic.net) 的拼音数据(**可以修改**) ## 参考资料 +* [汉语拼音方案](http://www.moe.edu.cn/s78/A19/yxs_left/moe_810/s230/195802/t19580201_186000.html) * [Unihan Database Lookup](http://www.unicode.org/charts/unihan.html) * [汉典 zdic.net](http://www.zdic.net/) * [字海网,叶典网](http://zisea.com/) @@ -45,6 +47,8 @@ * [GB 18030 - 维基百科,自由的百科全书](https://zh.wikipedia.org/wiki/GB_18030#PUA) * [通用规范汉字表 - 维基百科,自由的百科全书](https://zh.wikipedia.org/wiki/%E9%80%9A%E7%94%A8%E8%A7%84%E8%8C%83%E6%B1%89%E5%AD%97%E8%A1%A8) * [China’s 通用规范汉字表 (Tōngyòng Guīfàn Hànzìbiǎo)](https://blogs.adobe.com/CCJKType/2014/03/china-8105.html) +* [日本汉字的汉语读音规范](http://www.moe.gov.cn/s78/A19/yxs_left/moe_810/s230/201001/t20100115_75698.html) +* [日本汉字的汉语普通话规范读音表- 维基百科](https://zh.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC%E6%B1%89%E5%AD%97#7_%E6%97%A5%E6%9C%AC%E6%B1%89%E5%AD%97%E7%9A%84%E6%B1%89%E8%AF%AD%E6%99%AE%E9%80%9A%E8%AF%9D%E8%A7%84%E8%8C%83%E8%AF%BB%E9%9F%B3%E8%A1%A8) [unihan]: http://www.unicode.org/charts/unihan.html diff --git a/kMandarin_overwrite.txt b/kMandarin_overwrite.txt index a882f5a..4c0d9bc 100644 --- a/kMandarin_overwrite.txt +++ b/kMandarin_overwrite.txt @@ -62,37 +62,3 @@ U+295F5: zhēng # 𩗵 U+29B5D: wǒ # 𩭝 U+2A048: zhuāng # 𪁈 U+2A2A2: shí # 𪊢 - -# 日本汉字读音 -U+5302: xiōng,yún # 匂, yún 为日本汉字读音; xiōng 为现代汉语读音; -U+4E3C: jǐng,dǎn # 丼, dǎn 为日本汉字读音; jǐng 为现代汉语读音; -U+8FBB: shí # 辻 -U+8FBC: rù # 込 -U+51E7: jīn # 凧 -U+6763: shān # 杣 -U+67A0: zá # 枠 -U+7551: tián # 畑 -U+6803: lì # 栃 -U+6802: méi # 栂 -U+5CE0: kǎ # 峠 -U+4FE3: yǔ # 俣 -U+7C7E: rèn # 籾 -U+7560: tián # 畠 -U+96EB: xià # 雫 -U+7B39: shì # 笹 -U+5840: píng # 塀 -U+6919: chāng # 椙 -U+7872: yù # 硲 -U+86EF: lǎo # 蛯 -U+55B0: cān # 喰 -U+643E: zhà # 搾 -U+698A: shén # 榊 -U+50CD: dòng # 働 -U+7CC0: huā # 糀 -U+9786: bǐng # 鞆 -U+69C7: zhēn # 槇 -U+6A2B: jīan # 樫 -U+9D2B: tián # 鴫 -U+567A: xīn # 噺 -U+7C17: liáng # 簗 -U+9EBF: mó # 麿 diff --git a/kanji.txt b/kanji.txt new file mode 100644 index 0000000..807cebd --- /dev/null +++ b/kanji.txt @@ -0,0 +1,32 @@ +U+5302: yún # 匂 yún 为日本汉字读音; xiōng 为现代汉语读音; +U+4E3C: dǎn # 丼 dǎn 为日本汉字读音; jǐng 为现代汉语读音; +U+8FBB: shí # 辻 +U+8FBC: rù # 込 +U+51E7: jīn # 凧 +U+6763: shān # 杣 +U+67A0: zá # 枠 +U+7551: tián # 畑 +U+6803: lì # 栃 +U+6802: méi # 栂 +U+5CE0: kǎ # 峠 +U+4FE3: yǔ # 俣 +U+7C7E: rèn # 籾 +U+7560: tián # 畠 +U+96EB: xià # 雫 +U+7B39: shì # 笹 +U+5840: píng # 塀 +U+6919: chāng # 椙 +U+7872: yù # 硲 +U+86EF: lǎo # 蛯 +U+55B0: cān # 喰 +U+643E: zhà # 搾 +U+698A: shén # 榊 +U+50CD: dòng # 働 +U+7CC0: huā # 糀 +U+9786: bǐng # 鞆 +U+69C7: zhēn # 槇 +U+6A2B: jiān # 樫 +U+9D2B: tián # 鴫 +U+567A: xīn # 噺 +U+7C17: liáng # 簗 +U+9EBF: mó # 麿 diff --git a/merge_unihan.py b/merge_unihan.py index 08dbfd3..89c4319 100644 --- a/merge_unihan.py +++ b/merge_unihan.py @@ -93,6 +93,9 @@ def extend_pinyins(old_map, new_map, only_no_exists=False): with open('GBK_PUA.txt') as fp: pua_pinyin_map = parse_pinyins(fp) extend_pinyins(raw_pinyin_map, pua_pinyin_map) + with open('kanji.txt') as fp: + _map = parse_pinyins(fp) + extend_pinyins(raw_pinyin_map, _map, only_no_exists=True) with open('overwrite.txt') as fp: overwrite_pinyin_map = parse_pinyins(fp) diff --git a/overwrite.txt b/overwrite.txt index 740e220..1233c6f 100644 --- a/overwrite.txt +++ b/overwrite.txt @@ -40,3 +40,5 @@ U+E864: luán #  U+241FE: yíng # 𤇾 U+275C8: nú # 𧗈 U+47C1: xiāo,chāo # 䟁 +U+9EBF: mí # 麿 +U+7C17: zhù # 簗 diff --git a/pinyin.txt b/pinyin.txt index 74ce997..79b13c1 100644 --- a/pinyin.txt +++ b/pinyin.txt @@ -17554,7 +17554,7 @@ U+7C13: diāo # 簓 U+7C14: suō # 簔 U+7C15: lè # 簕 U+7C16: duàn # 簖 -U+7C17: liang # 簗 +U+7C17: zhù # 簗 U+7C18: xiāo # 簘 U+7C19: bó # 簙 U+7C1A: mì # 簚 @@ -26426,7 +26426,7 @@ U+9EBB: má,mā # 麻 U+9EBC: me # 麼 U+9EBD: mó,má,ma,me # 麽 U+9EBE: huī # 麾 -U+9EBF: mo # 麿 +U+9EBF: mí # 麿 U+9EC0: zōu # 黀 U+9EC1: nún # 黁 U+9EC2: fén # 黂 From 70940bb8f37a1dcad0fbfa2709e23690835eae87 Mon Sep 17 00:00:00 2001 From: mozillazg Date: Sat, 1 Jun 2019 17:40:57 +0800 Subject: [PATCH 3/3] =?UTF-8?q?=E5=8E=BB=E6=8E=89=E5=87=A0=E4=B8=AA?= =?UTF-8?q?=E6=9C=89=E8=AF=AF=E7=9A=84=E8=BD=BB=E5=A3=B0=E6=95=B0=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + overwrite.txt | 3 +++ pinyin.txt | 6 +++--- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38b5a29..c0fde10 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## [0.8.0] (2019-06-01) * 增加 `kanji.txt` 日本自造汉字的拼音数据 via [#32]. Thanks [@LuoZijun](https://github.com/LuoZijun) +* 去掉几个有误的轻声数据 ## [0.7.0] (2019-03-31) diff --git a/overwrite.txt b/overwrite.txt index 1233c6f..88f88cc 100644 --- a/overwrite.txt +++ b/overwrite.txt @@ -42,3 +42,6 @@ U+275C8: nú # 𧗈 U+47C1: xiāo,chāo # 䟁 U+9EBF: mí # 麿 U+7C17: zhù # 簗 +U+8279: cǎo # 艹 +U+88CF: lǐ # 裏 +U+88E1: lǐ # 裡 diff --git a/pinyin.txt b/pinyin.txt index 79b13c1..9597986 100644 --- a/pinyin.txt +++ b/pinyin.txt @@ -19188,7 +19188,7 @@ U+8275: pīng # 艵 U+8276: yàn # 艶 U+8277: yàn # 艷 U+8278: cǎo # 艸 -U+8279: cao # 艹 +U+8279: cǎo # 艹 U+827A: yì # 艺 U+827B: lè,jí # 艻 U+827C: tīng,dǐng # 艼 @@ -20810,7 +20810,7 @@ U+88CB: shù # 裋 U+88CC: jiá,jiā,xié # 裌 U+88CD: kǔn # 裍 U+88CE: chéng,chěng # 裎 -U+88CF: lǐ,li # 裏 +U+88CF: lǐ # 裏 U+88D0: juān # 裐 U+88D1: shēn # 裑 U+88D2: póu,bāo # 裒 @@ -20828,7 +20828,7 @@ U+88DD: zhuāng # 裝 U+88DE: shuì # 裞 U+88DF: shā # 裟 U+88E0: qún # 裠 -U+88E1: lǐ,li # 裡 +U+88E1: lǐ # 裡 U+88E2: lián,shāo # 裢 U+88E3: liǎn # 裣 U+88E4: kù # 裤