ropensci · kirillkoncha · Dec 7, 2021
diff --git a/NAMESPACE b/NAMESPACE
@@ -34,6 +34,7 @@ export(url.lang)
 export(valpal.feature)
 export(vanuatu.feature)
 export(wals.feature)
+export(wcs.feature)
 importFrom(grDevices,gray)
 importFrom(grDevices,topo.colors)
 importFrom(jsonlite,fromJSON)

diff --git a/R/wcs.R b/R/wcs.R
@@ -0,0 +1,30 @@
+#' World Color Survey (WCS) data
+#'
+#' Data from the database of WCS Data Archives (\url{https://www1.icsi.berkeley.edu/wcs/data.html}). This dataset is created for the \code{\link{wcs.feature}} function.
+#'
+#' @format A data frame with 112408 rows and 19 variables:
+#' \describe{
+#'   \item{grid_coordinates}{Grid coordinates}
+#'   \item{language_id}{Language id}
+#'   \item{speaker_id}{Speaker id}
+#'   \item{term_abbr}{Color term abbreviation}
+#'   \item{language_original}{Language name in the original dataset}
+#'   \item{language}{Language name in lingtypology}
+#'   \item{family}{Language family}
+#'   \item{iso}{ISO code}
+#'   \item{glottocode}{Glottocode}
+#'   \item{location}{Location}
+#'   \item{fieldworkers}{Fieldworkers}
+#'   \item{longitude}{Longitude}
+#'   \item{latitude}{Latitude}
+#'   \item{focus_response}{Focus response}
+#'   \item{term_id}{Color term id}
+#'   \item{term_transcription}{Color term transcription}
+#'   \item{speakers_age}{Speakers age}
+#'   \item{speakers_sex}{Speakers sex}
+#'   \item{hex_code}{Hex codes of colors from WCS Mapping Table (\url{https://www1.icsi.berkeley.edu/wcs/images/jrus-20100531/wcs-chart-4x.png})}
+#' }
+#
+
+"wcs"
+
diff --git a/R/wcs.feature.R b/R/wcs.feature.R
@@ -0,0 +1,29 @@
+#' Opens data from the database of WCS Data Archives
+#'
+#' This function opens downloaded data from the database of WCS Data Archives (\url{https://www1.icsi.berkeley.edu/wcs/data.html}).
+#'
+#' @param feature "Berlin and Kay" to show data collected by Berlin and Kay; "WCS" for World Color Survey Database. Both databases were created by asking native speakers of different languages (1) to name each of 330 Munsell chips, shown in a constant, random order, and (2) to pick out, from a palette of these chips, the best example(s) ("foci") of the major terms elicited in the naming task. The two databases differ in the number of languages: the WCS Database is significantly larger.
+#' @return A data frame with the requested dataset (\code{\link{wcs}} or \code{\link{wcs_bk}}); for any other value a hint is printed and \code{NULL} is returned invisibly.
+#' @author Kirill Koncha <[email protected]>
+#' @seealso \code{\link{afbo.feature}}, \code{\link{autotyp.feature}}, \code{\link{oto_mangueanIC.feature}}, \code{\link{phoible.feature}}, \code{\link{sails.feature}}, \code{\link{valpal.feature}}, \code{\link{wals.feature}}
+#' @examples
+#'
+#' wcs.feature("WCS")
+#'
+#' @export
+wcs.feature <- function(feature) {
+  message("Don't forget to cite a source:
+
+Richard Cook, Paul Kay, Terry Regier. WCS Data Archives (Available online at http://www.icsi.berkeley.edu/wcs/data.html, Accessed on ...)")
+  # isTRUE() keeps each check a single TRUE/FALSE, so NA, NULL or longer
+  # vectors reach the hint below instead of making if() fail.
+  if (isTRUE(feature == "Berlin and Kay")) {
+    return(lingtypology::wcs_bk)
+  }
+  if (isTRUE(feature == "WCS")) {
+    return(lingtypology::wcs)
+  }
+  message('It seems that function does not have this parameter. Use "WCS" to get World Color Survey Database and "Berlin and Kay" to get original data collected by Berlin and Kay.')
+  invisible(NULL)
+}
+
diff --git a/R/wcs_bk.R b/R/wcs_bk.R
@@ -0,0 +1,23 @@
+#' Berlin and Kay data
+#'
+#' Data collected by Berlin and Kay, from the database of WCS Data Archives (\url{https://www1.icsi.berkeley.edu/wcs/data.html}). This dataset is created for the \code{\link{wcs.feature}} function.
+#'
+#' @format A data frame with 32211 rows and 13 variables:
+#' \describe{
+#'   \item{language_id}{Language id}
+#'   \item{speaker_id}{Speaker id}
+#'   \item{term_abbr}{Color term abbreviation}
+#'   \item{language_original}{Language name in the original dataset}
+#'   \item{language}{Language name in lingtypology}
+#'   \item{iso}{ISO code}
+#'   \item{glottocode}{Glottocode}
+#'   \item{longitude}{Longitude}
+#'   \item{latitude}{Latitude}
+#'   \item{focus_response}{Focus response}
+#'   \item{grid_coordinates}{Grid coordinates}
+#'   \item{term_id}{Color term id}
+#'   \item{term_transcription}{Color term transcription}
+#' }
+#
+
+"wcs_bk"
diff --git a/data/wcs.RData b/data/wcs.RData
diff --git a/data/wcs_bk.RData b/data/wcs_bk.RData
diff --git a/database_creation/wcs_database_creation.R b/database_creation/wcs_database_creation.R
@@ -0,0 +1,186 @@
+# Rebuilds the `wcs` and `wcs_bk` datasets from the WCS Data Archives
+# (https://www1.icsi.berkeley.edu/wcs/data.html) and saves them under data/.
+# Input and output paths are relative, so run this from the package root.
+library(tidyverse)
+library(lingtypology)
+
+# World Color Survey (WCS) data ----
+
+lang <- utils::read.delim("http://www1.icsi.berkeley.edu/wcs/data/20021219/txt/lang.txt",
+                          encoding = "UTF-8", header = FALSE)
+terms <- utils::read.delim("http://www1.icsi.berkeley.edu/wcs/data/20041016/txt/dict.txt",
+                           encoding = "UTF-8")
+foci <- utils::read.delim("https://www1.icsi.berkeley.edu/wcs/data/20030414/txt/foci-exp.txt",
+                          encoding = "UTF-8", header = FALSE)
+
+names(foci)[names(foci) == "V1"] <- "id"
+names(foci)[names(foci) == "V2"] <- "speaker_id"
+names(foci)[names(foci) == "V3"] <- "focus_response"
+names(foci)[names(foci) == "V4"] <- "WCSC"
+names(foci)[names(foci) == "V5"] <- "grid_coordinates"
+foci <- foci[c("id", "speaker_id", "focus_response", "WCSC", "grid_coordinates")]
+names(terms)[names(terms) == "X.LNUM"] <- "id"
+names(lang)[names(lang) == "V1"] <- "id"
+
+# wcs_codes.csv is deliberately not shipped with the package (it was removed
+# from the repository); the path records where it is expected when rebuilding.
+codes <- read.csv("database_creation/wcs_codes.csv", sep = ";")
+names(codes)[names(codes) == "Index"] <- "id"
+codes$ISO.639.3.Code[codes$id == 93] <- "tac"
+
+# The source spells accented letters as literal escapes such as {\x87}; replace
+# every occurrence with a plain ASCII letter.
+# NOTE: mutate_all() touches every column, so all of them (including `id`)
+# come out as character vectors.
+escape_map <- c(
+  "\\{\\\\x87\\}" = "a",
+  "\\{\\\\x96\\}" = "n",
+  "\\{\\\\x9C\\}" = "u",
+  "\\{\\\\x8B\\}" = "a",
+  "\\{\\\\x97\\}" = "o",
+  "\\{\\\\x8A\\}" = "a"
+)
+lang <- lang %>%
+  mutate_all(~ str_replace_all(., escape_map))
+
+lang[lang == "*"] <- NA
+
+# Collect the distinct fieldworker names of each language into one string.
+# Presumably V4-V6 hold up to three fieldworkers per language -- TODO confirm
+# against lang.txt.
+lang$fieldworkers <- vapply(seq_len(nrow(lang)), function(i) {
+  workers <- unique(c(lang$V4[i], lang$V5[i], lang$V6[i]))
+  # Drop empty slots so no "NA" text leaks into the joined string.
+  workers <- workers[!is.na(workers)]
+  if (length(workers) == 0) NA_character_ else toString(workers)
+}, character(1))
+
+lang <- merge(lang, codes, by = "id")
+
+# Look glottocodes up from the merged ISO column so that each value stays
+# aligned with its own row; merge() may order rows differently from `codes`.
+lang$glottocode <- lingtypology::gltc.iso(lang$ISO.639.3.Code)
+
+lang[lang == "NA"] <- NA
+
+names(lang)[names(lang) == "V2"] <- "language_original"
+names(lang)[names(lang) == "Family"] <- "family"
+names(lang)[names(lang) == "Country.Where"] <- "location"
+names(lang)[names(lang) == "ISO.639.3.Code"] <- "iso"
+
+lang$language <- lingtypology::lang.gltc(lang$glottocode)
+lang <- lang[c("id", "language_original", "language", "family", "iso",
+               "glottocode", "location", "fieldworkers")]
+
+# Language 93 is resolved explicitly through the ISO code "tac".
+lang$glottocode[lang$id == 93] <- lingtypology::gltc.iso("tac")
+lang$language[lang$id == 93] <- lingtypology::lang.iso("tac")
+lang$longitude <- lingtypology::long.lang(lang$language)
+lang$latitude <- lingtypology::lat.lang(lang$language)
+
+foci <- merge(foci, terms, by = c("id", "WCSC"))
+lang <- merge(lang, foci, by = c("id"))
+
+speakers <- utils::read.delim("http://www1.icsi.berkeley.edu/wcs/data/20100912/spkr-lsas.txt",
+                              encoding = "UTF-8", header = FALSE)
+names(speakers)[names(speakers) == "V1"] <- "id"
+names(speakers)[names(speakers) == "V2"] <- "speaker_id"
+names(speakers)[names(speakers) == "V3"] <- "speakers_age"
+names(speakers)[names(speakers) == "V4"] <- "speakers_sex"
+
+lang <- merge(lang, speakers, by = c("id", "speaker_id"))
+names(lang)[names(lang) == "TNUM"] <- "term_id"
+names(lang)[names(lang) == "WCSC"] <- "term_abbr"
+names(lang)[names(lang) == "TRAN"] <- "term_transcription"
+
+chip_range <- utils::read.delim("http://www1.icsi.berkeley.edu/wcs/data/20021219/txt/term.txt",
+                                encoding = "UTF-8", header = FALSE)
+names(chip_range)[names(chip_range) == "V1"] <- "id"
+names(chip_range)[names(chip_range) == "V2"] <- "speaker_id"
+names(chip_range)[names(chip_range) == "V3"] <- "chip_range"
+names(chip_range)[names(chip_range) == "V4"] <- "term_abbr"
+
+# The chip column is dropped: only the id/speaker/term keys are merged in.
+chip_range <- subset(chip_range, select = -chip_range)
+lang <- merge(lang, chip_range, by = c("id", "speaker_id", "term_abbr"))
+lang <- lang[!duplicated(lang), ]
+names(lang)[names(lang) == "id"] <- "language_id"
+rownames(lang) <- seq_len(nrow(lang))
+# Missing abbreviations are stored as the literal text "NA" (presumably a real
+# abbreviation that was parsed as a missing value on import -- TODO confirm).
+lang$term_abbr[is.na(lang$term_abbr)] <- "NA"
+
+# colors.csv is read relative to the package root, like wcs_codes.csv.
+colors <- utils::read.csv("database_creation/colors.csv",
+                          encoding = "UTF-8", header = TRUE)
+lang <- merge(lang, colors, by = c("grid_coordinates"))
+lang <- lang %>%
+  mutate(language = str_replace(language, "Huastec, San Luís Potosí", "Huastec"))
+
+# Berlin and Kay data ----
+
+id <- c(1:20)
+language_original <- c("Arabic", "Bahasa Indonesia", "Bulgarian", "Cantonese", "Catalan",
+"English (American)", "Hebrew", "Hungarian", "Ibibo", "Japanese", "Korean", "Mandarin",
+"Mexican Spanish", "Pomo", "Swahili", "Tagalog", "Thai", "Tzeltal", "Urdu", "Vietnamese")
+iso <- c("arb", "ind", "bul", "yue", "cat", "eng", "heb", "hun", "ibb",
+            "jpn", "kor", "cmn", "spa", "pmm", "swh", "tgl", "tha", "tzh", "urd",
+            "vie")
+
+bk_langs <- data.frame(id, language_original, iso)
+bk_langs$glottocode <- lingtypology::gltc.iso(bk_langs$iso)
+bk_langs$language <- lingtypology::lang.gltc(bk_langs$glottocode)
+bk_langs$longitude <- lingtypology::long.lang(bk_langs$language)
+bk_langs$latitude <- lingtypology::lat.lang(bk_langs$language)
+
+# Two varieties get hand-set coordinates. Latitude must lie within [-90, 90],
+# so the values near -100 are the longitudes.
+bk_langs$longitude[bk_langs$language_original == "English (American)"] <-
+  -102.09603054800091
+bk_langs$latitude[bk_langs$language_original == "English (American)"] <-
+  39.10210731519252
+bk_langs$longitude[bk_langs$language_original == "Mexican Spanish"] <-
+  -104.80470289339866
+bk_langs$latitude[bk_langs$language_original == "Mexican Spanish"] <-
+  26.43076396738642
+
+bk_terms <- utils::read.delim("http://www1.icsi.berkeley.edu/wcs/data/berlin-kay/BK-term.txt",
+                              encoding = "UTF-8", header = FALSE)
+
+names(bk_terms)[names(bk_terms) == "V1"] <- "id"
+names(bk_terms)[names(bk_terms) == "V2"] <- "speaker_id"
+names(bk_terms)[names(bk_terms) == "V3"] <- "chip_range"
+names(bk_terms)[names(bk_terms) == "V4"] <- "term_abbr"
+
+bk_terms <- bk_terms[c("id", "speaker_id", "term_abbr")]
+bk_langs <- merge(bk_langs, bk_terms, by = "id")
+
+bk_dict <- utils::read.delim("http://www1.icsi.berkeley.edu/wcs/data/berlin-kay/BK-dict.txt",
+                             encoding = "UTF-8")
+
+names(bk_dict)[names(bk_dict) == "X.lnum"] <- "id"
+names(bk_dict)[names(bk_dict) == "tnum"] <- "term_id"
+names(bk_dict)[names(bk_dict) == "abbr"] <- "term_abbr"
+names(bk_dict)[names(bk_dict) == "term"] <- "term_transcription"
+
+bk_langs <- merge(bk_langs, bk_dict, by = c("id", "term_abbr"))
+
+# NOTE(review): the X1/X1.1/X1.2/J0 names below look like a first data record
+# used as the header; if BK-foci.txt has no header row, header = TRUE drops that
+# record -- confirm before regenerating the data.
+bk_foci <- utils::read.delim("http://www1.icsi.berkeley.edu/wcs/data/berlin-kay/BK-foci.txt",
+                             encoding = "UTF-8", header = TRUE)
+
+names(bk_foci)[names(bk_foci) == "X1"] <- "id"
+names(bk_foci)[names(bk_foci) == "X1.1"] <- "speaker_id"
+names(bk_foci)[names(bk_foci) == "X1.2"] <- "focus_response"
+names(bk_foci)[names(bk_foci) == "J0"] <- "grid_coordinates"
+
+bk_langs <- merge(bk_langs, bk_foci, by = c("id", "speaker_id"))
+names(bk_langs)[names(bk_langs) == "id"] <- "language_id"
+
+wcs <- lang
+wcs_bk <- bk_langs
+
+save(wcs, file = "data/wcs.RData", compress = "xz")
+save(wcs_bk, file = "data/wcs_bk.RData", compress = "xz")
diff --git a/man/wcs.Rd b/man/wcs.Rd
diff --git a/man/wcs.feature.Rd b/man/wcs.feature.Rd
diff --git a/man/wcs_bk.Rd b/man/wcs_bk.Rd
diff --git a/tests/testthat/test-wcs-feature.R b/tests/testthat/test-wcs-feature.R
@@ -0,0 +1,13 @@
+library(testthat)
+context("Tests for wcs.feature function")
+
+# expect_length() on a data frame counts its columns, so these checks pin the
+# number of variables in each dataset.
+test_that("wcs.feature returns the requested dataset", {
+  expect_length(suppressMessages(wcs.feature("WCS")), 19)
+  expect_length(suppressMessages(wcs.feature("Berlin and Kay")), 13)
+})
+
+test_that("wcs.feature returns NULL for an unknown feature", {
+  expect_null(suppressMessages(wcs.feature("unknown")))
+})