Notebook updates:
* Split the single big notebook into embed_audio, agile_modeling (search + classifier building), and analysis (validation and call density estimation); a sketch of the resulting hand-off follows this message.
* Add paged results to the agile modeling notebook, which helps when dealing with very large result sets; a condensed sketch of the pattern appears just before the diff.
* Add validation and call density code; a rough illustrative sketch appears after the diff.

PiperOrigin-RevId: 617919728
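Since embedding has moved to embed.ipynb, the agile modeling notebook now rebuilds its project state from the saved embedding configuration. A minimal sketch of that hand-off, using only calls that appear in this diff; working_dir is a hypothetical placeholder:

from etils import epath
from chirp.inference.search import bootstrap

working_dir = '/tmp/agile_project'  # hypothetical placeholder
embeddings_path = epath.Path(working_dir) / 'embeddings'
labeled_data_path = epath.Path(working_dir) / 'labeled'

# Reconstruct the project state from the config written by embed.ipynb.
bootstrap_config = bootstrap.BootstrapConfig.load_from_embedding_config(
    embeddings_path=embeddings_path,
    annotated_path=labeled_data_path)
project_state = bootstrap.BootstrapState(bootstrap_config)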
sdenton4 authored and copybara-github committed Mar 25, 2024
1 parent ce5befa commit 051debf
Showing 6 changed files with 823 additions and 238 deletions.
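The paged-results pattern mentioned above appears twice in agile_modeling.ipynb. A condensed sketch, assuming results and project_state come from the preceding search cells and 'my_label' is a placeholder label:

import numpy as np
from chirp.inference.search import display

samples_per_page = 10
# One page state tracks the current page across the whole result set.
page_state = display.PageState(
    np.ceil(len(results.search_results) / samples_per_page))

display.display_paged_results(
    results, page_state, samples_per_page,
    embedding_sample_rate=project_state.embedding_model.sample_rate,
    source_map=project_state.source_map,
    exclusive_labels=False,
    checkbox_labels=['my_label', 'unknown'],
    max_workers=5,
)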
297 changes: 78 additions & 219 deletions agile_modeling.ipynb
@@ -8,7 +8,9 @@
"source": [
"# Agile Modeling for Bioacoustics.\n",
"\n",
"This notebook provides a single-machine workflow for using pre-trained models to embed raw audio files, search, and create classifiers for target signals. This notebook is ideal for a single machine with a GPU for accelarated embedding."
"This notebook provides a workflow for creating custom classifiers for target signals, by first **searching** for training data, and then engaging in an **active learning** loop.\n",
"\n",
"We assume that embeddings have been pre-computed using `embed.ipynb`."
]
},
{
@@ -41,10 +43,12 @@
"colab_utils.initialize(use_tf_gpu=True, disable_warnings=True)\n",
"\n",
"from chirp import audio_utils\n",
"from chirp.inference import interface\n",
"from chirp.inference import embed_lib\n",
"from chirp.inference import tf_examples\n",
"from chirp.inference import models\n",
"from chirp.models import metrics\n",
"from chirp.taxonomy import namespace\n",
"from chirp.inference.search import bootstrap\n",
"from chirp.inference.search import search\n",
"from chirp.inference.search import display\n",
@@ -70,6 +74,7 @@
"# Set the embedding and labeled data directories.\n",
"embeddings_path = epath.Path(working_dir) / 'embeddings'\n",
"labeled_data_path = epath.Path(working_dir) / 'labeled'\n",
"custom_classifier_path = epath.Path(working_dir) / 'custom_classifier'\n",
"embeddings_glob = embeddings_path / 'embeddings-*'\n",
"\n",
"# OPTIONAL: Set up separation model.\n",
@@ -117,171 +122,6 @@
" separator = None"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_XpWruWMArWo"
},
"source": [
"## Embed Audio"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"id": "qx-SWjFYALok"
},
"outputs": [],
"source": [
"#@title Embedding Configuration. { vertical-output: true }\n",
"\n",
"config = config_dict.ConfigDict()\n",
"config.embed_fn_config = config_dict.ConfigDict()\n",
"config.embed_fn_config.model_config = config_dict.ConfigDict()\n",
"\n",
"# IMPORTANT: Select the targe audio files.\n",
"# source_file_patterns should contain a list of globs of audio files, like:\n",
"# ['/home/me/*.wav', '/home/me/other/*.flac']\n",
"config.source_file_patterns = [''] #@param\n",
"config.output_dir = embeddings_path.as_posix()\n",
"\n",
"# For Perch, set the perch_tfhub_model_version, and the model will load\n",
"# automagically from TFHub. Alternatively, set the model path for a local\n",
"# copy of the model.\n",
"# Note that only one of perch_model_path and perch_tfhub_version should be set.\n",
"perch_tfhub_version = 4 #@param\n",
"perch_model_path = '' #@param\n",
"\n",
"# For BirdNET, point to the specific tflite file.\n",
"birdnet_model_path = '' #@param\n",
"if model_choice == 'perch':\n",
" config.embed_fn_config.model_key = 'taxonomy_model_tf'\n",
" config.embed_fn_config.model_config.window_size_s = 5.0\n",
" config.embed_fn_config.model_config.hop_size_s = 5.0\n",
" config.embed_fn_config.model_config.sample_rate = 32000\n",
" config.embed_fn_config.model_config.tfhub_version = perch_tfhub_version\n",
" config.embed_fn_config.model_config.model_path = perch_model_path\n",
"elif model_choice == 'birdnet':\n",
" config.embed_fn_config.model_key = 'birdnet'\n",
" config.embed_fn_config.model_config.window_size_s = 3.0\n",
" config.embed_fn_config.model_config.hop_size_s = 3.0\n",
" config.embed_fn_config.model_config.sample_rate = 48000\n",
" config.embed_fn_config.model_config.model_path = birdnet_model_path\n",
" # Note: The v2_1 class list is appropriate for Birdnet 2.1, 2.2, and 2.3.\n",
" config.embed_fn_config.model_config.class_list_name = 'birdnet_v2_1'\n",
" config.embed_fn_config.model_config.num_tflite_threads = 4\n",
"\n",
"# Only write embeddings to reduce size.\n",
"config.embed_fn_config.write_embeddings = True\n",
"config.embed_fn_config.write_logits = False\n",
"config.embed_fn_config.write_separated_audio = False\n",
"config.embed_fn_config.write_raw_audio = False\n",
"\n",
"# Number of parent directories to include in the filename.\n",
"config.embed_fn_config.file_id_depth = 1"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"id": "jb-pEadVDidv"
},
"outputs": [],
"source": [
"#@title Set up. { vertical-output: true }\n",
"\n",
"# Set up the embedding function, including loading models.\n",
"embed_fn = embed_lib.EmbedFn(**config.embed_fn_config)\n",
"print('\\n\\nLoading model(s)...')\n",
"embed_fn.setup()\n",
"\n",
"# Create output directory and write the configuration.\n",
"output_dir = epath.Path(config.output_dir)\n",
"output_dir.mkdir(exist_ok=True, parents=True)\n",
"embed_lib.maybe_write_config(config, output_dir)\n",
"\n",
"# Create SourceInfos.\n",
"source_infos = embed_lib.create_source_infos(\n",
" config.source_file_patterns,\n",
" num_shards_per_file=config.get('num_shards_per_file', -1),\n",
" shard_len_s=config.get('shard_len_s', -1))\n",
"print(f'Found {len(source_infos)} source infos.')\n",
"\n",
"print('\\n\\nTest-run of model...')\n",
"window_size_s = config.embed_fn_config.model_config.window_size_s\n",
"sr = config.embed_fn_config.model_config.sample_rate\n",
"z = np.zeros([int(sr * window_size_s)])\n",
"embed_fn.embedding_model.embed(z)\n",
"print('Setup complete!')"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"id": "Dvnwf_LZDkBf"
},
"outputs": [],
"source": [
"#@title Run embedding. { vertical-output: true }\n",
"\n",
"# Uses multiple threads to load audio before embedding.\n",
"# This tends to be faster, but can fail if any audio files are corrupt.\n",
"\n",
"embed_fn.min_audio_s = 1.0\n",
"record_file = (output_dir / 'embeddings.tfrecord').as_posix()\n",
"succ, fail = 0, 0\n",
"\n",
"existing_embedding_ids = embed_lib.get_existing_source_ids(\n",
" output_dir, 'embeddings-*')\n",
"\n",
"new_source_infos = embed_lib.get_new_source_infos(\n",
" source_infos, existing_embedding_ids, config.embed_fn_config.file_id_depth)\n",
"\n",
"print(f'Found {len(new_source_infos)} existing embedding ids.'\n",
" f'Processing {len(new_source_infos)} new source infos. ')\n",
"\n",
"audio_iterator = audio_utils.multi_load_audio_window(\n",
" filepaths=[s.filepath for s in new_source_infos],\n",
" offsets=[s.shard_num * s.shard_len_s for s in new_source_infos],\n",
" sample_rate=config.embed_fn_config.model_config.sample_rate,\n",
" window_size_s=config.get('shard_len_s', -1.0),\n",
")\n",
"with tf_examples.EmbeddingsTFRecordMultiWriter(\n",
" output_dir=output_dir, num_files=config.get('tf_record_shards', 1)) as file_writer:\n",
" for source_info, audio in tqdm.tqdm(\n",
" zip(new_source_infos, audio_iterator), total=len(new_source_infos)):\n",
" file_id = source_info.file_id(config.embed_fn_config.file_id_depth)\n",
" offset_s = source_info.shard_num * source_info.shard_len_s\n",
" example = embed_fn.audio_to_example(file_id, offset_s, audio)\n",
" if example is None:\n",
" fail += 1\n",
" continue\n",
" file_writer.write(example.SerializeToString())\n",
" succ += 1\n",
" file_writer.flush()\n",
"print(f'\\n\\nSuccessfully processed {succ} source_infos, failed {fail} times.')\n",
"\n",
"fns = [fn for fn in output_dir.glob('embeddings-*')]\n",
"ds = tf.data.TFRecordDataset(fns)\n",
"parser = tf_examples.get_example_parser()\n",
"ds = ds.map(parser)\n",
"for ex in ds.as_numpy_iterator():\n",
" print(ex['filename'])\n",
" print(ex['embedding'].shape, flush=True)\n",
" break\n",
"\n",
"# Load/refresh bootstrap_config for subsequent steps.\n",
"print('\\nRefreshing bootstrap_config.', flush=True)\n",
"bootstrap_config = bootstrap.BootstrapConfig.load_from_embedding_config(\n",
" embeddings_path=embeddings_path,\n",
" annotated_path=labeled_data_path)\n",
"\n",
"project_state = bootstrap.BootstrapState(bootstrap_config)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
@@ -369,7 +209,7 @@
"source": [
"#@title Select the query channel. { vertical-output: true }\n",
"\n",
"query_label = 'some_audio' #@param\n",
"query_label = 'my_label' #@param\n",
"query_channel = 0 #@param\n",
"\n",
"if query_channel < 0 or sep_outputs is None:\n",
@@ -451,10 +291,18 @@
"source": [
"#@title Display results. { vertical-output: true }\n",
"\n",
"display.display_search_results(\n",
" results, sample_rate, project_state.source_map,\n",
"samples_per_page = 10\n",
"page_state = display.PageState(\n",
" np.ceil(len(results.search_results) / samples_per_page))\n",
"\n",
"display.display_paged_results(\n",
" results, page_state, samples_per_page,\n",
" embedding_sample_rate=project_state.embedding_model.sample_rate,\n",
" source_map=project_state.source_map,\n",
" exclusive_labels=False,\n",
" checkbox_labels=[query_label, 'unknown'],\n",
" max_workers=5)"
" max_workers=5,\n",
")"
]
},
{
@@ -522,11 +370,11 @@
"\n",
"# Number of random training examples to choose form each class.\n",
"# Set exactly one of train_ratio and train_examples_per_class\n",
"train_ratio = None #@param\n",
"train_examples_per_class = 2 #@param\n",
"train_ratio = 0.9 #@param\n",
"train_examples_per_class = None #@param\n",
"\n",
"# Number of random re-trainings. Allows judging model stability.\n",
"num_seeds = 1 #@param\n",
"num_seeds = 8 #@param\n",
"\n",
"# Classifier training hyperparams.\n",
"# These should be good defaults.\n",
@@ -581,7 +429,7 @@
"#@title Run model on target unlabeled data. { vertical-output: true }\n",
"\n",
"# Choose the target class to work with.\n",
"target_class = 'some_audio' #@param\n",
"target_class = 'my_class' #@param\n",
"# Choose a target logit; will display results close to the target.\n",
"# Set to None to get the highest-logit examples.\n",
"target_logit = None #@param\n",
@@ -629,11 +477,17 @@
"if 'unknown' not in merged.labels:\n",
" display_labels += ('unknown',)\n",
"\n",
"display.display_search_results(\n",
" results, project_state.embedding_model.sample_rate,\n",
" project_state.source_map,\n",
"samples_per_page = 10\n",
"page_state = display.PageState(\n",
" np.ceil(len(results.search_results) / samples_per_page))\n",
"\n",
"display.display_paged_results(\n",
" results, page_state, samples_per_page,\n",
" embedding_sample_rate=project_state.embedding_model.sample_rate,\n",
" source_map=project_state.source_map,\n",
" exclusive_labels=False,\n",
" checkbox_labels=display_labels,\n",
" max_workers=5)"
")"
]
},
{
@@ -651,6 +505,27 @@
" project_state.embedding_model.sample_rate)"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"id": "ZxasEcnhd7kP"
},
"outputs": [],
"source": [
"#@title Save the Custom Classifier. { vertical-output: true }\n",
"\n",
"wrapped_model = interface.LogitsOutputHead(\n",
" model_path=custom_classifier_path.as_posix(),\n",
" logits_key='logits',\n",
" logits_model=model,\n",
" class_list=namespace.ClassList('custom', merged.labels),\n",
")\n",
"wrapped_model.save_model(\n",
" custom_classifier_path,\n",
" embeddings_path)"
]
},
{
"cell_type": "markdown",
"metadata": {
@@ -670,54 +545,38 @@
"source": [
"#@title Write classifier inference CSV. { vertical-output: true }\n",
"\n",
"threshold = 1.0 #@param\n",
"output_filepath = '/tmp/inference.csv' #@param\n",
"\n",
"# Set detection thresholds.\n",
"default_threshold = 0.0 #@param\n",
"if default_threshold is None:\n",
" # In this case, all logits are written. This can lead to very large CSV files.\n",
" class_thresholds = None\n",
"else:\n",
" class_thresholds = collections.defaultdict(lambda: default_threshold)\n",
" # Set per-class thresholds here.\n",
" class_thresholds['my_class'] = 1.0\n",
"\n",
"exclude_classes = ['unknown'] #@param\n",
"\n",
"# include_classes is ignored if empty.\n",
"# If non-empty, only scores for these classes will be written.\n",
"include_classes = [] #@param\n",
"\n",
"# Create the embeddings dataset.\n",
"embeddings_ds = tf_examples.create_embeddings_dataset(\n",
" embeddings_path, file_glob='embeddings-*')\n",
"\n",
"def classify_batch(batch):\n",
" \"\"\"Classify a batch of embeddings.\"\"\"\n",
" emb = batch[tf_examples.EMBEDDING]\n",
" emb_shape = tf.shape(emb)\n",
" flat_emb = tf.reshape(emb, [-1, emb_shape[-1]])\n",
" logits = model(flat_emb)\n",
" logits = tf.reshape(\n",
" logits, [emb_shape[0], emb_shape[1], tf.shape(logits)[-1]])\n",
" # Take the maximum logit over channels.\n",
" logits = tf.reduce_max(logits, axis=-2)\n",
" batch['logits'] = logits\n",
" return batch\n",
"\n",
"inference_ds = tf_examples.create_embeddings_dataset(\n",
" embeddings_path, file_glob='embeddings-*')\n",
"inference_ds = inference_ds.map(\n",
" classify_batch, num_parallel_calls=tf.data.AUTOTUNE\n",
")\n",
"\n",
"with open(output_filepath, 'w') as f:\n",
" # Write column headers.\n",
" headers = ['filename', 'timestamp_s', 'label', 'logit']\n",
" f.write(', '.join(headers) + '\\n')\n",
" for ex in tqdm.tqdm(inference_ds.as_numpy_iterator()):\n",
" for t in range(ex['logits'].shape[0]):\n",
" for i, label in enumerate(merged.labels):\n",
" if ex['logits'][t, i] > threshold:\n",
" offset = ex['timestamp_s'] + t * bootstrap_config.embedding_hop_size_s\n",
" logit = '{:.2f}'.format(ex['logits'][t, i])\n",
" row = [ex['filename'].decode('utf-8'),\n",
" '{:.2f}'.format(offset),\n",
" label, logit]\n",
" f.write(', '.join(row) + '\\n')\n"
"classify.write_inference_csv(\n",
" embeddings_ds=embeddings_ds,\n",
" model=model,\n",
" labels=merged.labels,\n",
" output_filepath=output_filepath,\n",
" threshold=class_thresholds,\n",
" embedding_hop_size_s=bootstrap_config.embedding_hop_size_s,\n",
" include_classes=include_classes,\n",
" exclude_classes=exclude_classes)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HSqxSk74EIgs"
},
"source": []
}
],
"metadata": {
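The validation and call density code lands in analysis.ipynb, which is not expanded in this view. As a rough illustration only (not the notebook's actual code), one standard way to estimate call density is to validate a sample of detections from each classifier-score bin and combine per-bin precision with bin occupancy:

import numpy as np

# Hypothetical validation summary: fraction of embedding windows falling in
# each classifier-score bin, and the human-validated precision per bin.
bin_weights = np.array([0.7, 0.2, 0.08, 0.02])
positive_rate = np.array([0.01, 0.15, 0.6, 0.95])

# Law of total probability: expected fraction of windows containing a call.
call_density = float(np.sum(bin_weights * positive_rate))
print(f'Estimated call density: {call_density:.3f}')  # 0.104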
