From 2aa4353a62fa2e42139f5083a7f12bcd6923dc7c Mon Sep 17 00:00:00 2001 From: "David R. Mortensen" Date: Thu, 11 Jul 2019 17:14:01 -0400 Subject: [PATCH] Corrected ordering of rules --- epitran/data/map/find_duplicates.py | 20 ++++++++++++++++++++ epitran/data/rules/pinyin-to-ipa.txt | 12 ++++++------ epitran/xsampa.py | 2 ++ setup.py | 6 +++--- 4 files changed, 31 insertions(+), 9 deletions(-) create mode 100644 epitran/data/map/find_duplicates.py diff --git a/epitran/data/map/find_duplicates.py b/epitran/data/map/find_duplicates.py new file mode 100644 index 00000000..92e62741 --- /dev/null +++ b/epitran/data/map/find_duplicates.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 + +import csv +import sys +from collections import defaultdict + +def main(fn): + mappings = defaultdict(list) + with open(fn, encoding='utf-8') as f: + reader = csv.reader(f) + next(reader) + for orth, phon in reader: + mappings[orth].append(phon) + print(mappings) + for orth, phons in mappings.items(): + if len(phons) > 1: + print(orth) + +if __name__ == '__main__': + main(sys.argv[1]) diff --git a/epitran/data/rules/pinyin-to-ipa.txt b/epitran/data/rules/pinyin-to-ipa.txt index 1435c838..3157d63e 100644 --- a/epitran/data/rules/pinyin-to-ipa.txt +++ b/epitran/data/rules/pinyin-to-ipa.txt @@ -1,14 +1,14 @@ ng -> ŋ / _ -b -> p / _ -d -> t / _ -g -> k / _ p -> pʰ / _ t -> tʰ / _ k -> kʰ / _ -zh -> t͡ʂ / _ +b -> p / _ +d -> t / _ +g -> k / _ +zh -> ʈ͡ʂ / _ z -> t͡s / _ j -> t͡ɕ / _ -ch -> t͡ʂʰ / _ +ch -> ʈ͡ʂʰ / _ c -> t͡sʰ / _ q -> t͡ɕʰ / _ m -> m / _ @@ -20,6 +20,7 @@ x -> ɕ / _ h -> x / _ l -> l / _ r -> ɻ / _ +eɻ -> ɻ̩ / _ w -> 0 / _ u w -> w / _ y -> 0 / _ i @@ -30,7 +31,6 @@ i -> ɻ̩ / (t͡ʂ|t͡ʂʰ|ʂ) _ a -> a / _ o -> uo / (p|pʰ|f|m) _ o -> o / _ -eɻ -> ɻ̩ / _ e -> e / _ i -> i / _ u[:] -> y / _ diff --git a/epitran/xsampa.py b/epitran/xsampa.py index 393cc0a9..1e8bd215 100644 --- a/epitran/xsampa.py +++ b/epitran/xsampa.py @@ -3,6 +3,7 @@ unicode_literals) import os.path +import unicodedata import pkg_resources @@ -54,6 +55,7 @@ def ipa2xs(self, ipa): Non-IPA segments are skipped. """ xsampa = [] + ipa = unicodedata.normalize('NFD', ipa) while ipa: token = self.longest_prefix(ipa) if token: diff --git a/setup.py b/setup.py index a4453d16..65e6fe7d 100644 --- a/setup.py +++ b/setup.py @@ -1,17 +1,17 @@ from setuptools import setup setup(name='epitran', - version='0.60', + version='0.61', description='Tools for transcribing languages into IPA.', url='http://github.com/dmort27/epitran', - download_url='https://github.com/dmort27/epitran/archive/0.60.tar.gz', + download_url='https://github.com/dmort27/epitran/archive/0.61.tar.gz', author='David R. Mortensen', author_email='dmortens@cs.cmu.edu', license='MIT', install_requires=['setuptools', 'unicodecsv', 'regex', - 'panphon>=0.12', + 'panphon>=0.15', 'marisa_trie'], extras_require={':python_version<"3.0"': ['subprocess32']}, scripts=['epitran/bin/epitranscribe.py',