diff --git a/README.md b/README.md
index 7ff36f6..c202236 100644
--- a/README.md
+++ b/README.md
@@ -8,13 +8,11 @@
 ![Python](https://img.shields.io/badge/python->=3.7-blue?logo=python)
 
 
-## Usage
- - To use this template, click the green `Use this template` button and `Create new repository`.
- - After github initially creates the new repository, please wait an extra minute for the initialization scripts to finish organizing the repo.
- - To enable the automatic semantic version increments: in the repository go to `Settings` and `Collaborators and teams`. Click the green `Add people` button. Add `svc-aindscicomp` as an admin. Modify the file in `.github/workflows/tag_and_publish.yml` and remove the if statement in line 10. The semantic version will now be incremented every time code is committed into the main branch.
- - To publish to PyPI, enable semantic versioning and uncomment the publish block in `.github/workflows/tag_and_publish.yml`. The code will now be published to PyPI every time the code is committed into the main branch.
- - The `.github/workflows/test_and_lint.yml` file will run automated tests and style checks every time a Pull Request is opened. If the checks are undesired, the `test_and_lint.yml` can be deleted. The strictness of the code coverage level, etc., can be modified by altering the configurations in the `pyproject.toml` file and the `.flake8` file.
+## Overview
+This repository contains scripts that run after the completion of extracellular electrophysiology experiments. It serves two main purposes:
+- Ensuring that all data streams are aligned to a common timebase prior to upload
+- Generating a QC report that can be used to validate ephys data quality
 
 ## Installation
 To use the software, in the root directory, run
 
@@ -42,12 +40,12 @@ coverage run -m unittest discover && coverage report
 - Use **interrogate** to check that modules, methods, etc. have been documented thoroughly:
 
 ```bash
-interrogate .
+interrogate -v .
 ```
 
-- Use **flake8** to check that code is up to standards (no unused imports, etc.):
+- Use **isort** to automatically sort import statements:
 
 ```bash
-flake8 .
+isort .
 ```
 
 - Use **black** to automatically format the code into PEP standards:
@@ -55,11 +53,13 @@ flake8 .
 black .
 ```
 
-- Use **isort** to automatically sort import statements:
+- Use **flake8** to check that code is up to standards (no unused imports, etc.):
 
 ```bash
-isort .
+flake8 .
 ```
 
+
+
 ### Pull requests
 For internal members, please create a branch. For external members, please fork the repository and open a pull request from the fork. We'll primarily use [Angular](https://github.com/angular/angular/blob/main/CONTRIBUTING.md#commit) style for commit messages. Roughly, they should follow the pattern:
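A note for contributors picking this up: the hunk above ends at "Roughly, they should follow the pattern:", so the pattern itself falls outside the diff context. For reference, Angular-style subjects take the form `<type>(<scope>): <short summary>` — e.g. `fix(temporal_alignment): handle dropped barcodes` (an invented example, not a commit in this PR).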
diff --git a/doc_template/source/conf.py b/doc_template/source/conf.py
index afc260a..6d52a0d 100644
--- a/doc_template/source/conf.py
+++ b/doc_template/source/conf.py
@@ -1,12 +1,15 @@
 """Configuration file for the Sphinx documentation builder."""
+
 #
 # For the full list of built-in configuration values, see the documentation:
 # https://www.sphinx-doc.org/en/master/usage/configuration.html
+from datetime import date
+
 # -- Path Setup --------------------------------------------------------------
-from os.path import dirname, abspath
+from os.path import abspath, dirname
 from pathlib import Path
-from datetime import date
+
 from aind_ephys_rig_qc import __version__ as package_version
 
 INSTITUTE_NAME = "Allen Institute for Neural Dynamics"
 
diff --git a/pyproject.toml b/pyproject.toml
index 73c3124..b460011 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,13 @@
 readme = "README.md"
 dynamic = ["version"]
 dependencies = [
+    'open-ephys-python-tools',
+    'matplotlib',
+    'fpdf2',
+    'numpy',
+    'pandas',
+    'scipy',
+    'rich'
 ]
 
 [project.optional-dependencies]
diff --git a/src/aind_ephys_rig_qc/__init__.py b/src/aind_ephys_rig_qc/__init__.py
index d0a8547..024d51f 100644
--- a/src/aind_ephys_rig_qc/__init__.py
+++ b/src/aind_ephys_rig_qc/__init__.py
@@ -1,2 +1,3 @@
 """Init package"""
+
 __version__ = "0.0.0"
diff --git a/src/aind_ephys_rig_qc/generate_report.py b/src/aind_ephys_rig_qc/generate_report.py
new file mode 100644
index 0000000..7c6c7d2
--- /dev/null
+++ b/src/aind_ephys_rig_qc/generate_report.py
@@ -0,0 +1,259 @@
+"""
+Generates a PDF report from an Open Ephys data directory
+"""
+
+import json
+import os
+import sys
+
+import numpy as np
+import pandas as pd
+from open_ephys.analysis import Session
+
+from aind_ephys_rig_qc import __version__ as package_version
+from aind_ephys_rig_qc.pdf_utils import PdfReport
+from aind_ephys_rig_qc.qc_figures import plot_power_spectrum, plot_raw_data
+
+
+def generate_qc_report(directory, report_name="QC.pdf"):
+    """
+    Generates a PDF report from an Open Ephys data directory
+
+    Saves the report in the data directory
+
+    Parameters
+    ----------
+    directory : str
+        The path to the Open Ephys data directory
+    report_name : str
+        The name of the PDF report
+
+    """
+
+    pdf = PdfReport("aind-ephys-rig-qc v" + package_version)
+    pdf.add_page()
+    pdf.set_font("Helvetica", "B", size=12)
+    pdf.set_y(30)
+    pdf.write(h=12, text=f"Overview of recordings in {directory}")
+
+    pdf.set_font("Helvetica", "", size=10)
+    pdf.set_y(45)
+    pdf.embed_table(get_stream_info(directory), width=pdf.epw)
+
+    create_qc_plots(pdf, directory)
+
+    pdf.output(os.path.join(directory, report_name))
+
+
+def get_stream_info(directory):
+    """
+    Get information about the streams in an Open Ephys data directory
+
+    Parameters
+    ----------
+    directory : str
+        The path to the Open Ephys data directory
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame with information about the streams
+
+    """
+
+    session = Session(directory)
+
+    stream_info = {
+        "Record Node": [],
+        "Rec Idx": [],
+        "Exp Idx": [],
+        "Stream": [],
+        "Duration (s)": [],
+        "Channels": [],
+    }
+
+    for recordnode in session.recordnodes:
+
+        current_record_node = os.path.basename(recordnode.directory).split(
+            "Record Node "
+        )[1]
+
+        for recording in recordnode.recordings:
+
+            current_experiment_index = recording.experiment_index
+            current_recording_index = recording.recording_index
+
+            for stream in recording.continuous:
+
+                sample_rate = stream.metadata["sample_rate"]
+                data_shape = stream.samples.shape
+                channel_count = data_shape[1]
+                duration = data_shape[0] / sample_rate
+
+                stream_info["Record Node"].append(current_record_node)
+                stream_info["Rec Idx"].append(current_recording_index)
+                stream_info["Exp Idx"].append(current_experiment_index)
+                stream_info["Stream"].append(stream.metadata["stream_name"])
+                stream_info["Duration (s)"].append(duration)
+                stream_info["Channels"].append(channel_count)
+
+    return pd.DataFrame(data=stream_info)
+
+
+def get_event_info(events, stream_name):
+    """
+    Get information about the events in a given stream
+
+    Parameters
+    ----------
+    events : pd.DataFrame
+        A DataFrame with information about the events
+    stream_name : str
+        The name of the stream to query
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame with information about events for one stream
+
+    """
+    event_info = {
+        "Line": [],
+        "First Time (s)": [],
+        "Last Time (s)": [],
+        "Event Count": [],
+        "Event Rate (Hz)": [],
+    }
+
+    events_for_stream = events[events.stream_name == stream_name]
+
+    for line in events_for_stream.line.unique():
+        events_for_line = events_for_stream[
+            (events_for_stream.line == line) & (events_for_stream.state == 1)
+        ]
+
+        mean_interval = np.mean(np.diff(events_for_line.timestamp))
+        first_time = events_for_line.iloc[0].timestamp
+        last_time = events_for_line.iloc[-1].timestamp
+
+        event_info["Line"].append(line)
+        event_info["First Time (s)"].append(round(first_time, 2))
+        event_info["Last Time (s)"].append(round(last_time, 2))
+        event_info["Event Count"].append(events_for_line.shape[0])
+        event_info["Event Rate (Hz)"].append(round(1 / mean_interval, 2))
+
+    return pd.DataFrame(data=event_info)
+
+
+def create_qc_plots(pdf, directory):
+    """
+    Create QC plots for an Open Ephys data directory
+    """
+
+    session = Session(directory)
+
+    for recordnode in session.recordnodes:
+
+        current_record_node = os.path.basename(recordnode.directory).split(
+            "Record Node "
+        )[1]
+
+        for recording in recordnode.recordings:
+
+            current_experiment_index = recording.experiment_index
+            current_recording_index = recording.recording_index
+
+            events = recording.events
+
+            for stream in recording.continuous:
+
+                duration = (
+                    stream.samples.shape[0] / stream.metadata["sample_rate"]
+                )
+
+                stream_name = stream.metadata["stream_name"]
+
+                pdf.add_page()
+                pdf.set_font("Helvetica", "B", size=12)
+                pdf.set_y(30)
+                pdf.write(h=12, text=f"{stream_name}")
+                pdf.set_font("Helvetica", "", size=10)
+                pdf.set_y(40)
+                pdf.write(h=10, text=f"Record Node: {current_record_node}")
+                pdf.set_y(45)
+                pdf.write(
+                    h=10,
+                    text=f"Recording Index: " f"{current_recording_index}",
+                )
+                pdf.set_y(50)
+                pdf.write(
+                    h=10,
+                    text=f"Experiment Index: " f"{current_experiment_index}",
+                )
+                pdf.set_y(55)
+                pdf.write(
+                    h=10,
+                    text=f"Source Node: "
+                    f"{stream.metadata['source_node_name']}"
+                    f" ({stream.metadata['source_node_id']})",
+                )
+                pdf.set_y(60)
+                pdf.write(h=10, text=f"Duration: {duration} s")
+                pdf.set_y(65)
+                pdf.write(
+                    h=10,
+                    text=f"Sample Rate: "
+                    f"{stream.metadata['sample_rate']} Hz",
+                )
+                pdf.set_y(70)
+                pdf.write(h=10, text=f"Channels: {stream.samples.shape[1]}")
+
+                df = get_event_info(events, stream_name)
+
+                pdf.set_y(80)
+                pdf.set_font("Helvetica", "B", size=11)
+                pdf.write(h=12, text="Event info")
+                pdf.set_y(90)
+                pdf.set_font("Helvetica", "", size=10)
+                pdf.embed_table(df, width=pdf.epw)
+
+                pdf.set_y(120)
+                pdf.embed_figure(
+                    plot_raw_data(
+                        stream.samples,
+                        stream.metadata["sample_rate"],
+                        stream_name,
+                    )
+                )
+
+                pdf.set_y(200)
+                pdf.embed_figure(
+                    plot_power_spectrum(
+                        stream.samples,
+                        stream.metadata["sample_rate"],
+                        stream_name,
+                    )
+                )
+
+
+if __name__ == "__main__":
+
+    if len(sys.argv) != 3:
+        print("Two input arguments are required:")
+        print(" 1. A data directory")
+        print(" 2. A JSON parameters file")
+    else:
+        with open(
+            sys.argv[2],
+            "r",
+        ) as f:
+            parameters = json.load(f)
+
+        directory = sys.argv[1]
+
+        if not os.path.exists(directory):
+            raise ValueError(f"Data directory {directory} does not exist.")
+
+        generate_qc_report(
+            directory, report_name=parameters.get("report_name", "QC.pdf")
+        )
diff --git a/src/aind_ephys_rig_qc/images/aind-logo.png b/src/aind_ephys_rig_qc/images/aind-logo.png
new file mode 100644
index 0000000..04c9a52
Binary files /dev/null and b/src/aind_ephys_rig_qc/images/aind-logo.png differ
diff --git a/src/aind_ephys_rig_qc/parameters.json b/src/aind_ephys_rig_qc/parameters.json
new file mode 100644
index 0000000..98376b4
--- /dev/null
+++ b/src/aind_ephys_rig_qc/parameters.json
@@ -0,0 +1,5 @@
+{
+    "report_name" : "QC.pdf",
+    "align_timestamps_to" : "local",
+    "original_timestamp_filename" : "original_timestamps.npy"
+}
\ No newline at end of file
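For anyone reviewing this locally, the new entry point can be exercised directly. A minimal sketch — the session path is a placeholder, and the package plus its dependencies are assumed to be installed:

```python
# Generate a QC report for one Open Ephys session (sketch).
from aind_ephys_rig_qc.generate_report import generate_qc_report

data_dir = "/data/ephys/2024-01-01_session"  # placeholder path
generate_qc_report(data_dir, report_name="QC.pdf")

# Equivalent CLI, via the module's __main__ block:
#   python src/aind_ephys_rig_qc/generate_report.py <data_dir> parameters.json
```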
stream.metadata["sample_rate"], + stream_name, + ) + ) + + +if __name__ == "__main__": + + if len(sys.argv) != 3: + print("Two input arguments are required:") + print(" 1. A data directory") + print(" 2. A JSON parameters file") + else: + with open( + sys.argv[2], + "r", + ) as f: + parameters = json.load(f) + + directory = sys.argv[1] + + if not os.path.exists(directory): + raise ValueError(f"Data directory {directory} does not exist.") + + generate_qc_report(directory, **parameters) diff --git a/src/aind_ephys_rig_qc/images/aind-logo.png b/src/aind_ephys_rig_qc/images/aind-logo.png new file mode 100644 index 0000000..04c9a52 Binary files /dev/null and b/src/aind_ephys_rig_qc/images/aind-logo.png differ diff --git a/src/aind_ephys_rig_qc/parameters.json b/src/aind_ephys_rig_qc/parameters.json new file mode 100644 index 0000000..98376b4 --- /dev/null +++ b/src/aind_ephys_rig_qc/parameters.json @@ -0,0 +1,5 @@ +{ + "report_name" : "QC.pdf", + "align_timestamps_to" : "local", + "original_timestamp_filename" : "original_timestamps.npy" +} \ No newline at end of file diff --git a/src/aind_ephys_rig_qc/pdf_utils.py b/src/aind_ephys_rig_qc/pdf_utils.py new file mode 100644 index 0000000..ecf67ac --- /dev/null +++ b/src/aind_ephys_rig_qc/pdf_utils.py @@ -0,0 +1,106 @@ +""" +Helper class for generating PDF reports (using the fpdf2 library) +""" + +import os +import platform + +import matplotlib.pyplot as plt +import numpy as np +from fpdf import FPDF +from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas +from PIL import Image + + +class PdfReport(FPDF): + """ + Sub-class of FPDF with additional methods for embedding images and tables + """ + + def __init__(self, title="Running title"): + """ + Initialize the PDF report with a title + """ + super().__init__() + self.title = title + self.set_matplotlib_defaults() + + def header(self): + """ + Add a header with the Neural Dynamics logo and a running title + """ + self.image( + os.path.join(os.path.dirname(__file__), "images", "aind-logo.png"), + x=150, + y=10, + w=50, + ) + self.set_font("courier", "", 9) + self.cell(30, 15, self.title, align="L") + self.ln(20) + + def footer(self): + """ + Add a footer with the page number + """ + self.set_y(-15) + self.set_font("helvetica", "I", 8) + self.cell(0, 10, f"Page {self.page_no()}/{{nb}}", align="C") + + def set_matplotlib_defaults(self): + """ + Set the default matplotlib parameters for the report + """ + plt.rcParams["font.family"] = "sans-serif" + + if platform.system() == "Linux": + plt.rcParams["font.sans-serif"] = ["Nimbus Sans"] + else: + plt.rcParams["font.sans-serif"] = ["Arial"] + + def embed_figure(self, fig, width=190): + """ + Convert a matplotlib figure to an image and embed it in the PDF + + Parameters + ---------- + fig : matplotlib.figure.Figure + The figure to embed + width : int + The width of the image in the PDF + """ + + canvas = FigureCanvas(fig) + canvas.draw() + img = Image.fromarray(np.asarray(canvas.buffer_rgba())) + self.image(img, w=width) + + def embed_table(self, df, width=190): + """ + Create a table out of a pandas DataFrame and embed it in the PDF + + Parameters + ---------- + df : pandas.DataFrame + The table to embed + width : int + The width of the image in the PDF + """ + + DF = df.map(str) # convert all elements to string + DATA = [ + list(DF) + ] + DF.values.tolist() # Combine columns and rows in one list + + with self.table( + borders_layout="SINGLE_TOP_LINE", + cell_fill_color=240, + cell_fill_mode="ROWS", + line_height=self.font_size * 2, + 
text_align="CENTER", + width=width, + ) as table: + for data_row in DATA: + row = table.row() + for datum in data_row: + row.cell(datum) diff --git a/src/aind_ephys_rig_qc/qc_figures.py b/src/aind_ephys_rig_qc/qc_figures.py new file mode 100644 index 0000000..d3b7fb2 --- /dev/null +++ b/src/aind_ephys_rig_qc/qc_figures.py @@ -0,0 +1,71 @@ +""" +Generates figures for checking ephys data quality +""" + +from matplotlib.figure import Figure +from scipy.signal import welch + + +def plot_raw_data(data, sample_rate, stream_name): + """ + Plot a snippet of raw data as an image + + Parameters + ---------- + data : np.ndarray + The data to plot (samples x channels) + sample_rate : float + The sample rate of the data + stream_name : str + The name of the stream + + Returns + ------- + matplotlib.figure.Figure + Figure object containing the plot + """ + + fig = Figure(figsize=(10, 4)) + ax = fig.subplots() + + ax.imshow(data[:1000, :].T, aspect="auto", cmap="RdBu") + ax.set_title(f"{stream_name} Raw Data") + ax.set_xlabel("Samples") + ax.set_ylabel("Channels") + + return fig + + +def plot_power_spectrum(data, sample_rate, stream_name): + """ + Plot the power spectrum of the data + + Parameters + ---------- + data : np.ndarray + The data to plot (samples x channels) + sample_rate : float + The sample rate of the data + stream_name : str + The name of the stream + + Returns + ------- + matplotlib.figure.Figure + Figure object containing the plot + """ + + fig = Figure(figsize=(10, 4)) + ax = fig.subplots() + + subset = data[:1000, :] + + for i in range(subset.shape[1]): + f, p = welch(subset[:, i], fs=sample_rate) + ax.plot(f, p) + + ax.set_title(f"{stream_name} PSD") + ax.set_xlabel("Frequency") + ax.set_ylabel("Power") + + return fig diff --git a/src/aind_ephys_rig_qc/temporal_alignment.py b/src/aind_ephys_rig_qc/temporal_alignment.py new file mode 100644 index 0000000..fa4ef1d --- /dev/null +++ b/src/aind_ephys_rig_qc/temporal_alignment.py @@ -0,0 +1,288 @@ +""" +Aligns timestamps across multiple data streams +""" + +import json +import os +import sys + +import numpy as np + + +def align_timestamps( + directory, + align_timestamps_to="local", + original_timestamp_filename="original_timestamps.npy", +): + """ + Aligns timestamps across multiple streams + + Parameters + ---------- + directory : str + The path to the Open Ephys data directory + align_timestamps_to : str + The type of alignment to perform + Option 1: 'local' (default) + Option 2: 'harp' (extract Harp timestamps from the NIDAQ stream) + original_timestamp_filename : str + The name of the file for archiving the original timestamps + """ + + return None + + +def get_num_barcodes(harp_events, delta_time=0.5): + """ + Returns the number of barcodes + + Parameter + --------- + harp_events : pd.DataFrame + Events dataframe from open ephys tools + delta_time : float + The time difference between barcodes + + Returns + ------- + numbarcodes : int + The number of barcodes in the recordings + """ + (splits,) = np.where(np.diff(harp_events.timestamp) > delta_time) + return len(splits) - 1 + + +def get_barcode(harp_events, index, delta_time=0.5): + """ + Returns a subset of original DataFrame corresponding to a specific + barcode + + Parameter + --------- + harp_events : pd.DataFrame + Events dataframe from open ephys tools + index : int + The index of the barcode being requested + delta_time : float + The time difference between barcodes + + Returns + ------- + sample_numbers : np.array + Array of integer sample numbers for each barcode event + 
diff --git a/src/aind_ephys_rig_qc/temporal_alignment.py b/src/aind_ephys_rig_qc/temporal_alignment.py
new file mode 100644
index 0000000..fa4ef1d
--- /dev/null
+++ b/src/aind_ephys_rig_qc/temporal_alignment.py
@@ -0,0 +1,288 @@
+"""
+Aligns timestamps across multiple data streams
+"""
+
+import json
+import os
+import sys
+
+import numpy as np
+
+
+def align_timestamps(
+    directory,
+    align_timestamps_to="local",
+    original_timestamp_filename="original_timestamps.npy",
+):
+    """
+    Aligns timestamps across multiple streams
+
+    Parameters
+    ----------
+    directory : str
+        The path to the Open Ephys data directory
+    align_timestamps_to : str
+        The type of alignment to perform
+        Option 1: 'local' (default)
+        Option 2: 'harp' (extract Harp timestamps from the NIDAQ stream)
+    original_timestamp_filename : str
+        The name of the file for archiving the original timestamps
+    """
+
+    return None
+
+
+def get_num_barcodes(harp_events, delta_time=0.5):
+    """
+    Returns the number of barcodes
+
+    Parameters
+    ----------
+    harp_events : pd.DataFrame
+        Events dataframe from open ephys tools
+    delta_time : float
+        The time difference between barcodes
+
+    Returns
+    -------
+    num_barcodes : int
+        The number of barcodes in the recording
+    """
+    (splits,) = np.where(np.diff(harp_events.timestamp) > delta_time)
+    return len(splits) - 1
+
+
+def get_barcode(harp_events, index, delta_time=0.5):
+    """
+    Returns a subset of the original DataFrame corresponding to a specific
+    barcode
+
+    Parameters
+    ----------
+    harp_events : pd.DataFrame
+        Events dataframe from open ephys tools
+    index : int
+        The index of the barcode being requested
+    delta_time : float
+        The time difference between barcodes
+
+    Returns
+    -------
+    sample_numbers : np.array
+        Array of integer sample numbers for each barcode event
+    states : np.array
+        Array of states (1 or 0) for each barcode event
+
+    """
+    (splits,) = np.where(np.diff(harp_events.timestamp) > delta_time)
+
+    barcode = harp_events.iloc[splits[index] + 1:splits[index + 1] + 1]
+
+    return barcode.sample_number.values, barcode.state.values
+
+
+def convert_barcode_to_time(
+    sample_numbers, states, baud_rate=1000.0, sample_rate=30000.0
+):
+    """
+    Converts event sample numbers and states to
+    a Harp timestamp in seconds.
+
+    Harp timestamp is encoded as 32 bits, with
+    the least significant bit coming first, and 2 bits
+    between each byte.
+    """
+
+    samples_per_bit = int(sample_rate / baud_rate)
+    middle_sample = int(samples_per_bit / 2)
+
+    intervals = np.diff(sample_numbers)
+
+    barcode = np.concatenate(
+        [
+            np.ones((count,)) * state
+            for state, count in zip(states[:-1], intervals)
+        ]
+    ).astype("int")
+
+    val = np.concatenate(
+        [
+            np.arange(
+                samples_per_bit + middle_sample + samples_per_bit * 10 * i,
+                samples_per_bit * 10 * i
+                - middle_sample
+                + samples_per_bit * 10,
+                samples_per_bit,
+            )
+            for i in range(4)
+        ]
+    )
+    s = np.flip(barcode[val])
+    harp_time = s.dot(2 ** np.arange(s.size)[::-1])
+
+    return harp_time
+
+
+def rescale_times_linear(times_ephys, harp_events, delta_time=0.5):
+    """
+    Applies a linear rescaling to the ephys timestamps
+    based on the HARP timestamps.
+
+    Parameters
+    ----------
+    times_ephys : np.array
+        Array with the ephys timestamps
+    harp_events : pd.DataFrame
+        Events dataframe from open ephys tools
+    delta_time : float
+        The time difference between barcodes
+
+    Returns
+    -------
+    new_times : np.array
+        Rescaled ephys timestamps
+    """
+    splits = np.where(np.diff(harp_events.timestamp) > delta_time)[0]
+    last_index = len(splits) - 2
+
+    t1_ephys = (
+        harp_events.iloc[splits[0] + 1:splits[1] + 1].iloc[0].timestamp
+    )
+    t2_ephys = (
+        harp_events.iloc[splits[last_index] + 1:splits[last_index + 1] + 1]
+        .iloc[0]
+        .timestamp
+    )
+
+    sample_numbers, states = get_barcode(harp_events, 0)
+    t1_harp = convert_barcode_to_time(sample_numbers, states)
+    sample_numbers, states = get_barcode(harp_events, last_index)
+    t2_harp = convert_barcode_to_time(sample_numbers, states)
+
+    new_times = np.copy(times_ephys)
+    scaling = (t2_harp - t1_harp) / (t2_ephys - t1_ephys)
+    new_times -= t1_ephys
+    new_times *= scaling
+    new_times += t1_harp
+
+    return new_times
+
+
+def compute_ephys_harp_times(
+    times_ephys,
+    harp_events,
+    fs=30_000,
+    delta_time=0.5,
+    wrong_decoded_delta_time=2,
+):
+    """
+    Computes ephys timestamps assuming that they are uniformly sampled
+    between barcodes. The times_ephys are only used to get the
+    number of samples.
+
+    Parameters
+    ----------
+    times_ephys : np.array
+        Array with the ephys timestamps
+    harp_events : pd.DataFrame
+        Events dataframe from open ephys tools
+    fs : float
+        The sampling frequency of the ephys stream (Hz)
+    delta_time : float
+        The time difference between barcodes
+    wrong_decoded_delta_time : float
+        The time difference threshold between barcodes to detect a wrong
+        decoding and fix the barcode time
+
+    Returns
+    -------
+    new_times : np.array
+        Rescaled ephys timestamps
+    """
+    sampling_period = 1 / fs
+
+    # compute all barcode times
+    num_harp_events = get_num_barcodes(harp_events, delta_time=delta_time)
+    timestamps_harp = np.zeros(num_harp_events, dtype="float64")
+    for i in range(num_harp_events):
+        sample_numbers, states = get_barcode(
+            harp_events, i, delta_time=delta_time
+        )
+        barcode_harp_time = convert_barcode_to_time(sample_numbers, states)
+        timestamps_harp[i] = barcode_harp_time
+
+    # fix any wrong decoding by interpolating between neighbors
+    (wrong_decoded_idxs,) = np.where(
+        (np.diff(timestamps_harp)) > wrong_decoded_delta_time
+    )
+    print(f"Found {len(wrong_decoded_idxs)} badly aligned timestamps")
+    for idx in wrong_decoded_idxs + 1:
+        new_ts = (
+            timestamps_harp[idx - 1]
+            + (timestamps_harp[idx + 1] - timestamps_harp[idx - 1]) / 2
+        )
+        timestamps_harp[idx] = new_ts
+
+    # get indices of harp clock in ephys timestamps
+    (splits,) = np.where(np.diff(harp_events.timestamp) > delta_time)
+    harp_clock_indices = np.searchsorted(
+        times_ephys, harp_events.timestamp.values[splits[:-1] + 1]
+    )
+
+    # compute new ephys times
+    times_ephys_aligned = np.zeros_like(times_ephys, dtype="float64")
+
+    # here we use the BARCODE as a metronome and assume that ephys
+    # is uniformly sampled in between HARP beats
+    for i, (t_harp, harp_idx) in enumerate(
+        zip(timestamps_harp, harp_clock_indices)
+    ):
+        if i == 0:
+            first_sample = 0
+            num_samples = harp_idx
+            # no preceding barcode; extrapolate backwards from the first one
+            t_start = t_harp - num_samples / fs
+        else:
+            first_sample = harp_clock_indices[i - 1]
+            num_samples = harp_idx - harp_clock_indices[i - 1]
+            t_start = timestamps_harp[i - 1]
+
+        t_stop = t_harp - sampling_period
+        times_ephys_aligned[first_sample:harp_idx] = np.linspace(
+            t_start, t_stop, num_samples
+        )
+
+    # take care of last chunk
+    num_samples = len(times_ephys_aligned) - harp_idx
+    t_start = times_ephys_aligned[harp_idx - 1] + sampling_period
+    t_stop = t_start + num_samples / fs
+    times_ephys_aligned[harp_idx:] = np.linspace(t_start, t_stop, num_samples)
+
+    return times_ephys_aligned
+
+
+if __name__ == "__main__":
+
+    if len(sys.argv) != 3:
+        print("Two input arguments are required:")
+        print(" 1. A data directory")
+        print(" 2. A JSON parameters file")
+    else:
+        with open(
+            sys.argv[2],
+            "r",
+        ) as f:
+            parameters = json.load(f)
+
+        directory = sys.argv[1]
+
+        if not os.path.exists(directory):
+            raise ValueError(f"Data directory {directory} does not exist.")
+
+        align_timestamps(
+            directory,
+            align_timestamps_to=parameters.get("align_timestamps_to", "local"),
+            original_timestamp_filename=parameters.get(
+                "original_timestamp_filename", "original_timestamps.npy"
+            ),
+        )
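Since align_timestamps is still a stub here, the intended composition of the barcode helpers is easiest to see in a sketch. Assumptions flagged inline: the session path is a placeholder, and the barcode line number is invented — the helpers split barcodes on gaps in harp_events.timestamp, so the events table must be pre-filtered to the barcode line only:

```python
from open_ephys.analysis import Session

from aind_ephys_rig_qc.temporal_alignment import (
    convert_barcode_to_time,
    get_barcode,
    get_num_barcodes,
    rescale_times_linear,
)

session = Session("/data/ephys/2024-01-01_session")  # placeholder
recording = session.recordnodes[0].recordings[0]

# keep only the barcode line (line 1 is an assumption, not a rig fact)
harp_events = recording.events[recording.events.line == 1]

print(get_num_barcodes(harp_events), "barcodes found")
sample_numbers, states = get_barcode(harp_events, 0)
print("first barcode:", convert_barcode_to_time(sample_numbers, states), "s")

# linearly map a continuous stream's timestamps onto the Harp clock
aligned = rescale_times_linear(
    recording.continuous[0].timestamps, harp_events
)
```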
A JSON parameters file") + else: + with open( + sys.argv[2], + "r", + ) as f: + parameters = json.load(f) + + directory = sys.argv[1] + + if not os.path.exists(directory): + raise ValueError(f"Data directory {directory} does not exist.") + + align_timestamps(directory, **parameters) diff --git a/tests/test_example.py b/tests/test_example.py deleted file mode 100644 index 06e9e0d..0000000 --- a/tests/test_example.py +++ /dev/null @@ -1,16 +0,0 @@ -"""Example test template.""" - -import unittest - - -class ExampleTest(unittest.TestCase): - """Example Test Class""" - - def test_assert_example(self): - """Example of how to test the truth of a statement.""" - - self.assertTrue(1 == 1) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_generate_report.py b/tests/test_generate_report.py new file mode 100644 index 0000000..42e5c70 --- /dev/null +++ b/tests/test_generate_report.py @@ -0,0 +1,20 @@ +"""Example test template.""" + +import os +import unittest + +from aind_ephys_rig_qc.generate_report import generate_qc_report + + +class TestGenerateReport(unittest.TestCase): + """Example Test Class""" + + def test_generate_report(self): + """Example of how to test the truth of a statement.""" + + generate_qc_report("", "qc.pdf") + self.assertTrue(os.path.exists("qc.pdf")) + + +if __name__ == "__main__": + unittest.main()