From fc42dc34ecd9fb54afafcea76ca1fb8c3da38ad4 Mon Sep 17 00:00:00 2001 From: Dominic Schuhmacher Date: Sun, 9 Jun 2024 12:09:34 +0200 Subject: [PATCH] Complete pooled similarity experiments and add kanjidist link in README --- README.md | 4 +++- data-raw/pooled_similarity_benchmark.Rmd | 21 +++++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ade1967..9e2ac32 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,9 @@ [![R-CMD-check](https://github.com/dschuhmacher/kanjistat/actions/workflows/check-standard.yaml/badge.svg)](https://github.com/dschuhmacher/kanjistat/actions/workflows/check-standard.yaml) -kanjistat offers tools for processing and analyzing Japanese kanji characters. You can look up readings and meanings along with further dictionary information (to a large part provided by [KANJIDIC2](https://www.edrdg.org/wiki/index.php/KANJIDIC_Project)) including morphological details. You can plot kanji in your favorite font and process the resulting bitmap. You can represent kanji as nested lists of stroke paths (based on original data by [kanjiVG](https://kanjivg.tagaini.net/)), plot components in different color or display the tree structure, and analyze these decompositions and their individual strokes. Various old and new distance functions between kanji are implemented. +Showcase: [The Jōyō Kanji Map](https://www.kanjidist.org/) + +`kanjistat` offers tools for processing and analyzing Japanese kanji characters. You can look up readings and meanings along with further dictionary information (to a large part provided by [KANJIDIC2](https://www.edrdg.org/wiki/index.php/KANJIDIC_Project)) including morphological details. You can plot kanji in your favorite font and process the resulting bitmap. You can represent kanji as nested lists of stroke paths (based on original data by [kanjiVG](https://kanjivg.tagaini.net/)), plot components in different color or display the tree structure, and analyze these decompositions and their individual strokes. Various old and new distance functions between kanji are implemented. | ![Kanji with components in different colors](man/figures/hair_kveckanji_sm.svg) | ![Dendrogram of the kanji](man/figures/hair_kvecdend_sm.svg) | diff --git a/data-raw/pooled_similarity_benchmark.Rmd b/data-raw/pooled_similarity_benchmark.Rmd index 081abdf..01e8a2a 100644 --- a/data-raw/pooled_similarity_benchmark.Rmd +++ b/data-raw/pooled_similarity_benchmark.Rmd @@ -7,6 +7,7 @@ library(R.utils) library(yaml) library(tidyverse) library(kanjistat) +library(kanjistat.data) file_path <- "poolexp_judgements.yaml.gz" @@ -128,11 +129,27 @@ kanjidist_spec <- function(k1, k2) { kanjidist(k1, k2, compo_seg_depth1 = 4, compo_seg_depth2 = 4, approx="pcweighted", minor_warnings = FALSE) } -results <- pooled_distance_benchmark(kanjidist_spec, 100) +results <- pooled_distance_benchmark(kanjidist_spec, 312) df <- results -hist(unlist(df$jaccard), xlab="Jaccard Index, |A∩B| / |A∪B|", main="Jaccard Index of Most Similar Kanji as selected by Subjective \n Similarity Judgements vs. by the Similarity Metric") +hist(unlist(df$jaccard), freq=FALSE, xlab="Jaccard Index, |A∩B| / |A∪B|", main="Jaccard Index of Most Similar Kanji as selected by Subjective \n Similarity Judgements vs. by kanjidist") +cat("kanjidist:\n") +table(unlist(df$jaccard)) + +sedist_spec <- function(k1,k2) { + l1 <- convert_kanji(k1, output="character") + l2 <- convert_kanji(k2, output="character") + sedist(l1,l2) +} + +results <- pooled_distance_benchmark(sedist_spec, 312) + +df <- results + +hist(unlist(df$jaccard), freq=FALSE, xlab="Jaccard Index, |A∩B| / |A∪B|", main="Jaccard Index of Most Similar Kanji as selected by Subjective \n Similarity Judgements vs. by stroke edit dist") +cat("sedist:\n") +table(unlist(df$jaccard)) ``` Next, we load data from [Yencken and Baldwin (2006)](https://lars.yencken.org/papers/iccpol-2006.pdf).