From 317e40fecc44e71b0ec7bb5be2f6889a72a43bc6 Mon Sep 17 00:00:00 2001 From: Tom Denton Date: Thu, 3 Aug 2023 15:37:42 -0700 Subject: [PATCH] Add Weldy's calltype dataset and Anuraset. PiperOrigin-RevId: 553610183 --- chirp/data/soundscapes/dataset_fns.py | 50 +++ chirp/data/soundscapes/soundscapes.py | 24 ++ chirp/taxonomy/namespace_db.py | 19 +- chirp/taxonomy/taxonomy_database.json | 450 ++++++++++++++++++++++++ chirp/tests/testdata/anuraset.csv | 4 + chirp/tests/testdata/weldy_calltype.csv | 5 + 6 files changed, 547 insertions(+), 5 deletions(-) create mode 100644 chirp/tests/testdata/anuraset.csv create mode 100644 chirp/tests/testdata/weldy_calltype.csv diff --git a/chirp/data/soundscapes/dataset_fns.py b/chirp/data/soundscapes/dataset_fns.py index 6302a9e4..9fd4b998 100644 --- a/chirp/data/soundscapes/dataset_fns.py +++ b/chirp/data/soundscapes/dataset_fns.py @@ -209,3 +209,53 @@ def load_powdermill_annotations(annotations_path: epath.Path) -> pd.DataFrame: ) segments = annotations.annotations_to_dataframe(annos) return segments + + +def load_weldy_annotations(annotations_path: epath.Path) -> pd.DataFrame: + """Loads a dataframe of all annotations from the Weldy Calltype dataset.""" + filename_fn = lambda _, row: 'annotated_recordings/' + row['file'].strip() + start_time_fn = lambda row: float(row['start']) + end_time_fn = lambda row: float(row['end']) + filter_fn = lambda row: False + class_fn = lambda row: ( # pylint: disable=g-long-lambda + row['label'] + .replace('unk', 'unknown') + .replace('impossible', 'unknown') + .replace('unknown_chip', 'unknown') + .split(' ') + ) + annos = annotations.read_dataset_annotations_csvs( + [epath.Path(annotations_path)], + filename_fn=filename_fn, + namespace='weldy_calltype', + class_fn=class_fn, + start_time_fn=start_time_fn, + end_time_fn=end_time_fn, + filter_fn=filter_fn, + ) + segments = annotations.annotations_to_dataframe(annos) + return segments + + +def load_anuraset_annotations(annotations_path: epath.Path) 
-> pd.DataFrame: + """Loads a dataframe of all annotations.""" + filename_fn = lambda _, row: os.path.join( # pylint: disable=g-long-lambda + row['filename'].split('_')[0], row['filename'].strip() + ) + start_time_fn = lambda row: float(row['start_time_s']) + end_time_fn = lambda row: float(row['end_time_s']) + # There are a few SPECIES_LALSE labels which according to the authors should + # be ignored. + filter_fn = lambda row: '_LALSE' in row['label'] + class_fn = lambda row: row['label'].split(' ') + annos = annotations.read_dataset_annotations_csvs( + [epath.Path(annotations_path)], + filename_fn=filename_fn, + namespace='anuraset', + class_fn=class_fn, + start_time_fn=start_time_fn, + end_time_fn=end_time_fn, + filter_fn=filter_fn, + ) + segments = annotations.annotations_to_dataframe(annos) + return segments diff --git a/chirp/data/soundscapes/soundscapes.py b/chirp/data/soundscapes/soundscapes.py index 9129006a..74a098e1 100644 --- a/chirp/data/soundscapes/soundscapes.py +++ b/chirp/data/soundscapes/soundscapes.py @@ -335,6 +335,30 @@ class Soundscapes(bird_taxonomy.BirdTaxonomy): ), class_list_name='peru', ), + SoundscapesConfig( + name='weldy_calltype_full_length', + audio_glob='weldy_calltype/annotated_recordings/*.wav', + annotation_load_fn=dataset_fns.load_weldy_annotations, + annotation_filename='annotations.csv', + keep_unknown_annotation=True, + description=( + 'Full-length annotated bird call types from the PNW. ' + 'https://zenodo.org/record/8047850' + ), + class_list_name='weldy_calltype', + ), + SoundscapesConfig( + name='anuraset_full_length', + audio_glob='anuraset/raw_data/*/*.wav', + annotation_load_fn=dataset_fns.load_anuraset_annotations, + annotation_filename='annotations.csv', + keep_unknown_annotation=True, + description=( + 'Full-length annotated frog vocalizations. 
' + 'https://zenodo.org/record/8056090' + ), + class_list_name='anuraset', + ), ] def _info(self) -> tfds.core.DatasetInfo: diff --git a/chirp/taxonomy/namespace_db.py b/chirp/taxonomy/namespace_db.py index 77433f58..11b1def0 100644 --- a/chirp/taxonomy/namespace_db.py +++ b/chirp/taxonomy/namespace_db.py @@ -48,24 +48,33 @@ def validate_taxonomy_database(taxonomy_database: TaxonomyDatabase) -> None: """ namespaces = taxonomy_database.namespaces - for _, mapping in taxonomy_database.mappings.items(): + for mapping_name, mapping in taxonomy_database.mappings.items(): if ( set(mapping.mapped_pairs.keys()) - namespaces[mapping.source_namespace].classes ): - raise ValueError("unknown class in source") + raise ValueError( + f"Mapping {mapping_name} contains a source class not in " + f"the namespace ({mapping.source_namespace})." + ) if ( set(mapping.mapped_pairs.values()) - namespaces[mapping.target_namespace].classes ): - raise ValueError("unknown class in target") + raise ValueError( + f"Mapping {mapping_name} contains a target class not in " + f"the namespace ({mapping.target_namespace})." + ) - for _, class_list in taxonomy_database.class_lists.items(): + for class_name, class_list in taxonomy_database.class_lists.items(): classes = class_list.classes if set(classes) - namespaces[class_list.namespace].classes > { namespace.UNKNOWN_LABEL }: - raise ValueError("unknown class in class list") + raise ValueError( + f"ClassList {class_name} contains a class not in " + f"the namespace ({class_list.namespace})." 
+ ) def load_taxonomy_database( diff --git a/chirp/taxonomy/taxonomy_database.json b/chirp/taxonomy/taxonomy_database.json index 0e261cc3..f416ee28 100644 --- a/chirp/taxonomy/taxonomy_database.json +++ b/chirp/taxonomy/taxonomy_database.json @@ -64257,6 +64257,99 @@ } }, "mappings": { + "anuraset_to_anuraset_species": { + "mapped_pairs": { + "ADEDIP_L": "ADEDIP", + "ADEDIP_M": "ADEDIP", + "ADEMAR_L": "ADEMAR", + "ADEMAR_M": "ADEMAR", + "AMEPIC_L": "AMEPIC", + "BOAALB_L": "BOAALB", + "BOAALB_M": "BOAALB", + "BOAALM_L": "BOAALM", + "BOAALM_M": "BOAALM", + "BOABIS_L": "BOABIS", + "BOABIS_M": "BOABIS", + "BOAFAB_H": "BOAFAB", + "BOAFAB_L": "BOAFAB", + "BOAFAB_M": "BOAFAB", + "BOALEP_L": "BOALEP", + "BOALEP_M": "BOALEP", + "BOALUN_H": "BOALUN", + "BOALUN_L": "BOALUN", + "BOALUN_M": "BOALUN", + "BOAPRA_L": "BOAPRA", + "BOAPRA_M": "BOAPRA", + "BOARAN_L": "BOARAN", + "BOARAN_M": "BOARAN", + "DENCRU_L": "DENCRU", + "DENCRU_M": "DENCRU", + "DENELE_L": "DENELE", + "DENMIN_H": "DENMIN", + "DENMIN_L": "DENMIN", + "DENMIN_M": "DENMIN", + "DENNAH_L": "DENNAH", + "DENNAH_M": "DENNAH", + "DENNAN_L": "DENNAN", + "DENNAN_M": "DENNAN", + "ELABIC_L": "ELABIC", + "ELABIC_M": "ELABIC", + "ELAMAT_L": "ELAMAT", + "ELAMAT_M": "ELAMAT", + "LEPELE_L": "LEPELE", + "LEPFLA_L": "LEPFLA", + "LEPFUS_H": "LEPFUS", + "LEPFUS_L": "LEPFUS", + "LEPFUS_M": "LEPFUS", + "LEPLAB_H": "LEPLAB", + "LEPLAB_L": "LEPLAB", + "LEPLAB_M": "LEPLAB", + "LEPLAT_L": "LEPLAT", + "LEPLAT_M": "LEPLAT", + "LEPNOT_L": "LEPNOT", + "LEPNOT_M": "LEPNOT", + "LEPPOD_H": "LEPPOD", + "LEPPOD_L": "LEPPOD", + "LEPPOD_M": "LEPPOD", + "PHYALB_H": "PHYALB", + "PHYALB_L": "PHYALB", + "PHYALB_M": "PHYALB", + "PHYCUV_L": "PHYCUV", + "PHYCUV_M": "PHYCUV", + "PHYDIS_L": "PHYDIS", + "PHYDIS_M": "PHYDIS", + "PHYMAR_H": "PHYMAR", + "PHYMAR_L": "PHYMAR", + "PHYMAR_M": "PHYMAR", + "PHYNAT_L": "PHYNAT", + "PHYNAT_M": "PHYNAT", + "PHYSAU_H": "PHYSAU", + "PHYSAU_L": "PHYSAU", + "PHYSAU_M": "PHYSAU", + "PITAZU_L": "PITAZU", + "PITAZU_M": "PITAZU", 
+ "RHIICT_L": "RHIICT", + "RHIICT_M": "RHIICT", + "RHIORN_L": "RHIORN", + "RHIORN_M": "RHIORN", + "RHISCI_M": "RHISCI", + "SCIALT_L": "SCIALT", + "SCIFUS_L": "SCIFUS", + "SCIFUS_M": "SCIFUS", + "SCIFUV_L": "SCIFUV", + "SCIFUV_M": "SCIFUV", + "SCINAS_L": "SCINAS", + "SCIPER_L": "SCIPER", + "SCIPER_M": "SCIPER", + "SCIRIZ_L": "SCIRIZ", + "SCIRIZ_M": "SCIRIZ", + "SPHSUR_H": "SPHSUR", + "SPHSUR_L": "SPHSUR", + "SPHSUR_M": "SPHSUR" + }, + "source_namespace": "anuraset", + "target_namespace": "anuraset_species" + }, "ebird2021_species_to_family": { "mapped_pairs": { "aakspa1": "passeridae", @@ -270712,6 +270805,107 @@ "source_namespace": "ioc_12_2", "target_namespace": "ebird2021" }, + "weldy_calltype_to_ebird2021": { + "mapped_pairs": { + "amegfi_call_1": "amegfi", + "amegfi_call_2": "amegfi", + "amerob_call_1": "amerob", + "amerob_call_2": "amerob", + "amerob_song_1": "amerob", + "batpig1_call_1": "batpig1", + "batpig1_song_1": "batpig1", + "bewwre_call_3": "bewwre", + "bkcchi_song_1": "bkcchi", + "bkhgro_call_1": "bkhgro", + "bkhgro_song_1": "bkhgro", + "brncre_call_1": "brncre", + "brncre_song_1": "brncre", + "btywar_song_1": "btywar", + "bushti_call_1": "bushti", + "casvir_song_1": "casvir", + "cedwax_call_1": "cedwax", + "cedwax_call_2": "cedwax", + "chbchi_call_1": "chbchi", + "chbchi_call_3": "chbchi", + "chispa_song_1": "chispa", + "comrav_call_1": "comrav", + "coohaw_call_1": "coohaw", + "daejun_call_1": "daejun", + "daejun_call_2": "daejun", + "daejun_song_1": "daejun", + "dowwoo_call_1": "dowwoo", + "dusfly_song_1": "dusfly", + "evegro_call_1": "evegro", + "evegro_call_2": "evegro", + "gockin_call_1": "gockin", + "gockin_song_1": "gockin", + "gryjay_call_1": "gryjay", + "haiwoo_call_1": "haiwoo", + "hamfly_call_1": "hamfly", + "hamfly_song_1": "hamfly", + "herthr_call_2": "herthr", + "herthr_song_1": "herthr", + "herwar_song_1": "herwar", + "hutvir_song_1": "hutvir", + "lazbun_call_2": "lazbun", + "macwar_song_1": "macwar", + "mouchi_call_1": "mouchi", + 
"mouchi_song_1": "mouchi", + "mouqua_call_1": "mouqua", + "mouqua_call_2": "mouqua", + "naswar_song_1": "naswar", + "norfli_call_1": "norfli", + "norfli_call_2": "norfli", + "olsfly_call_1": "olsfly", + "olsfly_song_1": "olsfly", + "orcwar_song_1": "orcwar", + "pacwre1_call_1": "pacwre1", + "pacwre1_call_2": "pacwre1", + "pacwre1_song_1": "pacwre1", + "pasfly_call_1": "pasfly", + "pasfly_call_2": "pasfly", + "pasfly_call_3": "pasfly", + "pasfly_song_1": "pasfly", + "pilwoo_call_1": "pilwoo", + "pilwoo_call_2": "pilwoo", + "pinsis_call_1": "pinsis", + "pinsis_call_2": "pinsis", + "pinsis_song_1": "pinsis", + "purfin_song_1": "purfin", + "rebnut_call_2": "rebnut", + "rebnut_call_3": "rebnut", + "rebnut_call_4": "rebnut", + "rebnut_song_1": "rebnut", + "redjun1_song_1": "redjun1", + "saypho_song_1": "saypho", + "soogro1_song_1": "soogro1", + "spotow_call_1": "spotow", + "spotow_song_1": "spotow", + "stejay_call_1": "stejay", + "swathr_call_1": "swathr", + "swathr_call_2": "swathr", + "swathr_call_3": "swathr", + "swathr_song_1": "swathr", + "towsol_call_1": "towsol", + "towwar_call_1": "towwar", + "towwar_song_1": "towwar", + "varthr_call_1": "varthr", + "varthr_call_2": "varthr", + "varthr_song_1": "varthr", + "warvir_song_1": "warvir", + "westan_call_1": "westan", + "westan_song_1": "westan", + "wewpew_call_1": "wewpew", + "wewpew_song_1": "wewpew", + "wiltur_song_1": "wiltur", + "wlswar_call_1": "wlswar", + "wlswar_song_1": "wlswar", + "wrenti_song_1": "wrenti", + "yerwar_song_1": "yerwar" + }, + "source_namespace": "weldy_calltype", + "target_namespace": "ebird2021" + }, "xenocanto_11_2_to_ebird2022_species": { "mapped_pairs": { "abeillia abeillei": "emchum1", @@ -282301,6 +282495,143 @@ } }, "namespaces": { + "anuraset": { + "classes": [ + "ADEDIP_L", + "ADEDIP_M", + "ADEMAR_L", + "ADEMAR_M", + "AMEPIC_L", + "BOAALB_L", + "BOAALB_M", + "BOAALM_L", + "BOAALM_M", + "BOABIS_L", + "BOABIS_M", + "BOAFAB_H", + "BOAFAB_L", + "BOAFAB_M", + "BOALEP_L", + "BOALEP_M", + 
"BOALUN_H", + "BOALUN_L", + "BOALUN_M", + "BOAPRA_L", + "BOAPRA_M", + "BOARAN_L", + "BOARAN_M", + "DENCRU_L", + "DENCRU_M", + "DENELE_L", + "DENMIN_H", + "DENMIN_L", + "DENMIN_M", + "DENNAH_L", + "DENNAH_M", + "DENNAN_L", + "DENNAN_M", + "ELABIC_L", + "ELABIC_M", + "ELAMAT_L", + "ELAMAT_M", + "LEPELE_L", + "LEPFLA_L", + "LEPFUS_H", + "LEPFUS_L", + "LEPFUS_M", + "LEPLAB_H", + "LEPLAB_L", + "LEPLAB_M", + "LEPLAT_L", + "LEPLAT_M", + "LEPNOT_L", + "LEPNOT_M", + "LEPPOD_H", + "LEPPOD_L", + "LEPPOD_M", + "PHYALB_H", + "PHYALB_L", + "PHYALB_M", + "PHYCUV_L", + "PHYCUV_M", + "PHYDIS_L", + "PHYDIS_M", + "PHYMAR_H", + "PHYMAR_L", + "PHYMAR_M", + "PHYNAT_L", + "PHYNAT_M", + "PHYSAU_H", + "PHYSAU_L", + "PHYSAU_M", + "PITAZU_L", + "PITAZU_M", + "RHIICT_L", + "RHIICT_M", + "RHIORN_L", + "RHIORN_M", + "RHISCI_M", + "SCIALT_L", + "SCIFUS_L", + "SCIFUS_M", + "SCIFUV_L", + "SCIFUV_M", + "SCINAS_L", + "SCIPER_L", + "SCIPER_M", + "SCIRIZ_L", + "SCIRIZ_M", + "SPHSUR_H", + "SPHSUR_L", + "SPHSUR_M" + ] + }, + "anuraset_species": { + "classes": [ + "ADEDIP", + "ADEMAR", + "AMEPIC", + "BOAALB", + "BOAALM", + "BOABIS", + "BOAFAB", + "BOALEP", + "BOALUN", + "BOAPRA", + "BOARAN", + "DENCRU", + "DENELE", + "DENMIN", + "DENNAH", + "DENNAN", + "ELABIC", + "ELAMAT", + "LEPELE", + "LEPFLA", + "LEPFUS", + "LEPLAB", + "LEPLAT", + "LEPNOT", + "LEPPOD", + "PHYALB", + "PHYCUV", + "PHYDIS", + "PHYMAR", + "PHYNAT", + "PHYSAU", + "PITAZU", + "RHIICT", + "RHIORN", + "RHISCI", + "SCIALT", + "SCIFUS", + "SCIFUV", + "SCINAS", + "SCIPER", + "SCIRIZ", + "SPHSUR" + ] + }, "audioset": { "classes": [ "/g/11b630rrvh", @@ -386494,6 +386825,125 @@ "zosterornis whiteheadi" ] }, + "weldy_calltype": { + "classes": [ + "NA_call_1", + "airplane_engine_1", + "amegfi_call_1", + "amegfi_call_2", + "amerob_call_1", + "amerob_call_2", + "amerob_song_1", + "batpig1_call_1", + "batpig1_song_1", + "bewwre_call_3", + "bkcchi_song_1", + "bkhgro_call_1", + "bkhgro_song_1", + "brncre_call_1", + "brncre_song_1", + "btywar_song_1", + 
"bushti_call_1", + "casvir_song_1", + "cedwax_call_1", + "cedwax_call_2", + "chainsaw_engine_1", + "chbchi_call_1", + "chbchi_call_3", + "chipmu_chirp_1", + "chispa_song_1", + "complete", + "comrav_call_1", + "coohaw_call_1", + "daejun_call_1", + "daejun_call_2", + "daejun_song_1", + "dog_bark_1", + "dousqu_chirp_1", + "dousqu_rattle_1", + "dowwoo_call_1", + "drum", + "dusfly_song_1", + "empty", + "evegro_call_1", + "evegro_call_2", + "gockin_call_1", + "gockin_song_1", + "gryjay_call_1", + "gunshot_shot_1", + "haiwoo_call_1", + "hamfly_call_1", + "hamfly_song_1", + "herthr_call_2", + "herthr_song_1", + "herwar_song_1", + "hutvir_song_1", + "insect_buzz_1", + "lazbun_call_2", + "macwar_song_1", + "mouchi_call_1", + "mouchi_song_1", + "mouqua_call_1", + "mouqua_call_2", + "naswar_song_1", + "norfli_call_1", + "norfli_call_2", + "olsfly_call_1", + "olsfly_song_1", + "orcwar_song_1", + "pacwre1_call_1", + "pacwre1_call_2", + "pacwre1_song_1", + "paruli_song_1", + "pasfly_call_1", + "pasfly_call_2", + "pasfly_call_3", + "pasfly_song_1", + "pilwoo_call_1", + "pilwoo_call_2", + "pinsis_call_1", + "pinsis_call_2", + "pinsis_song_1", + "purfin_song_1", + "rain", + "rebnut_call_2", + "rebnut_call_3", + "rebnut_call_4", + "rebnut_song_1", + "redjun1_song_1", + "saypho_song_1", + "sensor_noise_1", + "setoph_song_1", + "soogro1_song_1", + "spotow_call_1", + "spotow_song_1", + "stejay_call_1", + "swathr_call_1", + "swathr_call_2", + "swathr_call_3", + "swathr_song_1", + "towsol_call_1", + "towwar_call_1", + "towwar_song_1", + "tree_creak", + "truck_beep_1", + "varthr_call_1", + "varthr_call_2", + "varthr_song_1", + "vehicle_engine_1", + "warvir_song_1", + "westan_call_1", + "westan_song_1", + "wewpew_call_1", + "wewpew_song_1", + "wiltur_song_1", + "wingbeat", + "wlswar_call_1", + "wlswar_song_1", + "wrenti_song_1", + "yerwar_song_1" + ] + }, "xenocanto_10_1": { "classes": [ "accipiter collaris", diff --git a/chirp/tests/testdata/anuraset.csv b/chirp/tests/testdata/anuraset.csv 
new file mode 100644 index 00000000..b7b3569f --- /dev/null +++ b/chirp/tests/testdata/anuraset.csv @@ -0,0 +1,4 @@ +filename,start_time_s,end_time_s,label +INCT17_20191113_040000.wav,0.000000,59.988753,PHYSAU_M +INCT17_20191113_040000.wav,9.960731,10.344517,LEPFUS_M +INCT17_20191113_040000.wav,13.113683,13.302624,PITAZU_L diff --git a/chirp/tests/testdata/weldy_calltype.csv b/chirp/tests/testdata/weldy_calltype.csv new file mode 100644 index 00000000..84900e50 --- /dev/null +++ b/chirp/tests/testdata/weldy_calltype.csv @@ -0,0 +1,5 @@ +file,clip_complete,start,end,eBird_2021,label +Site_001_Rep_A.wav,TRUE,4,6,empty,empty +Site_001_Rep_A.wav,TRUE,6,8,spotow,spotow_song_1 +Site_001_Rep_A.wav,TRUE,6,8,complete,complete +Site_001_Rep_A.wav,TRUE,8,10,olsfly,olsfly_song_1