Commit

Merge branch 'develop' into hackdna/citation
hackdna authored Oct 25, 2024
2 parents 037221c + 9989247 commit 6ff68f6
Showing 18 changed files with 51 additions and 43 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
@@ -11,14 +11,14 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
-os: ['ubuntu-22.04', 'windows-2022', 'macos-12']
+os: ['ubuntu-24.04', 'windows-2022', 'macos-14']
steps:
- name: Check out Forest code
uses: actions/checkout@v4
- name: Set up Python
-uses: actions/setup-python@v4
+uses: actions/setup-python@v5
with:
-python-version: 3.8
+python-version: 3.11
- name: Install Forest dependencies for Linux
# required by librosa
if: ${{ startsWith(matrix.os, 'ubuntu') }}
@@ -28,7 +28,7 @@ jobs:
- name: Install Forest dependencies for Windows
# required by librosa
if: ${{ startsWith(matrix.os, 'windows') }}
-uses: FedericoCarboni/setup-ffmpeg@v2
+uses: FedericoCarboni/setup-ffmpeg@v3
id: setup-ffmpeg
- name: Install Forest
run: pip install -e .
4 changes: 2 additions & 2 deletions .github/workflows/docs.yml
@@ -7,7 +7,7 @@ on:
jobs:
build-html-docs:
name: 'Build HTML docs'
-runs-on: 'ubuntu-22.04'
+runs-on: 'ubuntu-24.04'
defaults:
run:
working-directory: './docs'
@@ -17,7 +17,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
-python-version: 3.8
+python-version: 3.11
- name: Install documentation build dependencies
run: pip install -r requirements.txt
- name: Build HTML docs
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,4 +1,5 @@
__pycache__/
+.venv/
.DS_Store

# IntelliJ, VsCode project files
@@ -21,4 +22,4 @@ __pycache__/
docs/_build/

# any python environment files
-.python-version
+.python-version
2 changes: 1 addition & 1 deletion .readthedocs.yaml
@@ -9,7 +9,7 @@ version: 2
build:
os: ubuntu-22.04
tools:
-python: "3.8"
+python: "3.11"

# Optionally declare the Python requirements required to build your docs
python:
6 changes: 2 additions & 4 deletions README.md
@@ -4,13 +4,11 @@

<img width="264" height="99" src="forest-logo-color.png" alt="Forest logo">

-# Forest (Python 3.8)
-
-The Onnela Lab at the Harvard T.H. Chan School of Public Health has developed the Forest library to analyze smartphone-based high-throughput digital phenotyping data. The main intellectual challenge in smartphone-based digital phenotyping has moved from data collection to data analysis. Our research focuses on the development of mathematical and statistical methods for analyzing intensive high-dimensional data. We are actively developing the Forest library for analyzing smartphone-based high-throughput digital phenotyping data collected with the [Beiwe](https://github.com/onnela-lab/beiwe-backend) platform. Forest will implement our methods for analyzing Beiwe data as a Python 3.8 package and is released under the BSD-3 open-source license. The Forest library will continue to grow over the coming years as we develop new analytical methods.
+The Onnela Lab at the Harvard T.H. Chan School of Public Health has developed the Forest library to analyze smartphone-based high-throughput digital phenotyping data. The main intellectual challenge in smartphone-based digital phenotyping has moved from data collection to data analysis. Our research focuses on the development of mathematical and statistical methods for analyzing intensive high-dimensional data. We are actively developing the Forest library for analyzing smartphone-based high-throughput digital phenotyping data collected with the [Beiwe](https://github.com/onnela-lab/beiwe-backend) platform. Forest will implement our methods for analyzing Beiwe data as a Python package and is released under the BSD-3 open-source license. The Forest library will continue to grow over the coming years as we develop new analytical methods.

Forest can be run locally but is also integrated into the Beiwe back-end on AWS, consistent with the preferred big-data computing paradigm of moving computation to the data. Integrated with Beiwe, Forest can be used to generate on-demand analytics, most importantly daily or hourly summary statistics of collected data, which are stored in a relational database on AWS. The system also implements an API for Tableau, which supports the creation of customizable workbooks and dashboards to view data summaries and troubleshoot any issues with data collection. Tableau is commercial software but is available under free viewer licenses and may be free to academic users for the first year (see Tableau for more information).

-For more detailed info on specific subpackages, see our [Documentation](https://forest.beiwe.org). Please note that Forest uses Python 3.8.
+For more detailed info on specific subpackages, see our [Documentation](https://forest.beiwe.org). Please note that Forest uses Python 3.11.

# Description

6 changes: 3 additions & 3 deletions docs/requirements.txt
@@ -1,4 +1,4 @@
-myst-parser==0.17.2
-sphinx==4.5.0
+myst-parser==3.0.0
+sphinx==7.3.7
sphinx-copybutton==0.5.0
-sphinx_rtd_theme==1.0.0
+sphinx_rtd_theme==2.0.0
8 changes: 4 additions & 4 deletions docs/source/aws.md
@@ -593,10 +593,10 @@ to that user.
Assume we are ssh-ed to the EC2 instance.

To create a new environment named `forest_main` with Python version
-`3.8` use
+`3.11` use

``` sh
-conda create --name forest_main python=3.8
+conda create --name forest_main python=3.11
```

To activate an environment named `forest_main` use
@@ -678,12 +678,12 @@ base * /opt/anaconda
[Forest](https://github.com/onnela-lab/forest) is a Python library for
analyzing smartphone-based high-throughput digital phenotyping data
collected with the Beiwe platform. Forest implements methods as a Python
-3.8 package. Forest is integrated into the Beiwe back-end on AWS but can
+3.11 package. Forest is integrated into the Beiwe back-end on AWS but can
also be run locally.

Assume we are ssh-ed to the EC2 instance. Use the commands below to
activate Anaconda environment of choice (here, `forest_main` that has
-Python `3.8` installed) and install `git`, `pip`.
+Python `3.11` installed) and install `git`, `pip`.

``` sh
conda activate forest_main
2 changes: 1 addition & 1 deletion docs/source/index.md
@@ -32,7 +32,7 @@ passive-data.md

# Home

-Forest is a library for analyzing smartphone-based high-throughput digital phenotyping data collected with the [Beiwe platform](https://www.beiwe.org/). Forest implements methods as a Python 3.8 package. Forest is integrated into the Beiwe back-end on AWS but can also be run locally.
+Forest is a library for analyzing smartphone-based high-throughput digital phenotyping data collected with the [Beiwe platform](https://www.beiwe.org/). Forest implements methods as a Python 3.11 package. Forest is integrated into the Beiwe back-end on AWS but can also be run locally.

**Table of Contents**
```{contents}
2 changes: 1 addition & 1 deletion docs/source/logging.md
@@ -358,4 +358,4 @@ a, b, c = wrapper(x, y, z, 'path/to/log/output/directory')

* [The Python Standard Library's documentation for `logging`](https://docs.python.org/3/library/logging.html)
* Vinay Sajip's [*Logging HOWTO*](https://docs.python.org/3/howto/logging.html)
-* [`LogRecord` attributes](https://docs.python.org/3.8/library/logging.html?highlight=logging#logrecord-attributes)
+* [`LogRecord` attributes](https://docs.python.org/3.11/library/logging.html#logrecord-attributes)
2 changes: 1 addition & 1 deletion forest/jasmine/traj2stats.py
@@ -1821,6 +1821,6 @@ def gps_stats_generate_summary(
if parameters.save_osm_log:
with open(
f"{logs_folder}/locations_logs_{frequency.name.lower()}.json",
-"wa",
+"a",
) as loc:
json.dump(logs, loc, indent=4)
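The one-character fix above corrects an invalid mode string: Python's `open()` requires exactly one base mode (`r`, `w`, `x`, or `a`), so `"wa"` raises a `ValueError` before any file is even created. A minimal standalone sketch (the path is illustrative, not the one used by `traj2stats.py`):

```python
import os
import tempfile

path = os.path.join(tempfile.gettempdir(), "locations_logs_daily.json")

# The old mode string "wa" mixes write and append and is rejected outright.
try:
    open(path, "wa")
except ValueError as exc:
    print(exc)  # e.g. "must have exactly one of create/read/write/append mode"

# The corrected mode "a" appends, creating the file if it does not exist.
with open(path, "a") as loc:
    loc.write("{}\n")
os.remove(path)
```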
16 changes: 10 additions & 6 deletions forest/oak/base.py
@@ -184,8 +184,12 @@ def compute_interpolate_cwt(tapered_bout: np.ndarray, fs: int = 10,
# interpolate coefficients
freqs = out[2]
freqs_interp = np.arange(0.5, 4.5, 0.05)
-ip = interpolate.interp2d(range(coefs.shape[1]), freqs, coefs)
-coefs_interp = ip(range(coefs.shape[1]), freqs_interp)
+interpolator = interpolate.RegularGridInterpolator(
+(freqs, range(coefs.shape[1])), coefs
+)
+grid_x, grid_y = np.meshgrid(freqs_interp, range(coefs.shape[1]),
+indexing='ij')
+coefs_interp = interpolator((grid_x, grid_y))

# trim spectrogram from the coi
coefs_interp = coefs_interp[:, 5*fs:-5*fs]
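The rewrite above is forced by SciPy itself: `interpolate.interp2d` was deprecated in SciPy 1.10 and removed in 1.14, with `RegularGridInterpolator` as the recommended replacement. A self-contained sketch of the same pattern on toy data (the shapes and values here are made up, not Oak's real CWT output):

```python
import numpy as np
from scipy import interpolate

# Toy stand-in for the CWT coefficients: 8 frequency rows x 20 time columns.
freqs = np.linspace(0.5, 4.5, 8)
coefs = np.random.default_rng(0).normal(size=(8, 20))
freqs_interp = np.arange(0.5, 4.5, 0.05)  # 80 target frequencies

# The grid axes go in as a tuple; query points come from an 'ij' meshgrid,
# mirroring the replacement code in the diff.
interpolator = interpolate.RegularGridInterpolator(
    (freqs, np.arange(coefs.shape[1])), coefs
)
grid_x, grid_y = np.meshgrid(freqs_interp, np.arange(coefs.shape[1]),
                             indexing="ij")
coefs_interp = interpolator((grid_x, grid_y))
print(coefs_interp.shape)  # (80, 20)
```

Unlike `interp2d`, the new interpolator evaluates at arbitrary point sets rather than a cartesian product, which is why the explicit `meshgrid` is needed.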
@@ -524,7 +528,7 @@ def run_hourly(
cadence_temp = cadence_bout[t_hours_pd == t_unique]
cadence_temp = cadence_temp[cadence_temp > 0]
# store hourly metrics
-if math.isnan(steps_hourly[ind_to_store]):
+if math.isnan(steps_hourly[ind_to_store].item()):
steps_hourly[ind_to_store] = int(np.sum(cadence_temp))
walkingtime_hourly[ind_to_store] = len(cadence_temp)
else:
@@ -609,11 +613,11 @@ def run(study_folder: str, output_folder: str, tz_str: Optional[str] = None,
frequency == Frequency.HOURLY_AND_DAILY
or frequency == Frequency.HOURLY
):
-freq = 'H'
+freq = 'h'
elif frequency == Frequency.MINUTE:
-freq = 'T'
+freq = 'min'
else:
-freq = str(frequency.value/60) + 'H'
+freq = str(frequency.value/60) + 'h'

days_hourly = pd.date_range(date_start, date_end+timedelta(days=1),
freq=freq)[:-1]
2 changes: 1 addition & 1 deletion forest/oak/tests/test_run_hourly.py
@@ -23,7 +23,7 @@ def sample_run_input(signal_bout):
t_ind_pydate = pd.date_range(
start='2020-02-24 00:00:00',
end='2020-02-25 23:00:00',
-freq='H',
+freq='h',
tz='US/Eastern'
).to_pydatetime()
cadence_bout = np.array(
4 changes: 2 additions & 2 deletions forest/poplar/functions/log.py
@@ -15,7 +15,7 @@

# Dictionary of available log record attributes:
# For details, see:
-# https://docs.python.org/3.8/library/logging.html?highlight=logging#logrecord-attributes
+# https://docs.python.org/3.11/library/logging.html#logrecord-attributes
AVAILABLE_ATTRIBUTES = {
"asctime,msecs": "%(asctime)s", # Human-readable time with milliseconds.
"created": "%(created)f", # Unix timestamp (seconds since epoch).
@@ -134,7 +134,7 @@ def log_to_csv(
log_name (str): Name for the log file.
log_format (str): The format argument for logging.basicConfig.
For available attributes and formatting instructions, see:
-https://docs.python.org/3.8/library/logging.html?highlight=logging#logrecord-attributes)
+https://docs.python.org/3.11/library/logging.html#logrecord-attributes
header (list): Header for the csv.
Returns:
7 changes: 6 additions & 1 deletion forest/sycamore/common.py
@@ -628,14 +628,19 @@ def find_missing_data(user: str, survey_id: str, agg_data: pd.DataFrame,
].unique()
missing_times = []
for time in known_answers_submits:
+# If there were no timings submits recorded, every answers
+# submit will be missing
+if len(known_timings_submits) == 0:
+missing_times.append(time)
+continue

hours_from_nearest = np.min(
np.abs((pd.to_datetime(known_timings_submits)
- pd.to_datetime(time)).total_seconds())
) / 60 / 60
# add on the data if there is more than 1/2 hour between an
# answers submission and a timing submission.
-if hours_from_nearest > .5 or len(known_timings_submits) == 0:
+if hours_from_nearest > .5:
missing_times.append(time)
if len(missing_times) > 0:
missing_data = answers_data.loc[
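The added early `continue` is more than a tidy-up: it guards the `np.min` call that follows. Reducing a zero-length array raises a `ValueError` rather than returning NaN, so the old code, which only checked for emptiness after computing `hours_from_nearest`, would crash for a participant with no timings submits. A minimal reproduction (the variable name mirrors the diff; the empty array is a hypothetical input):

```python
import numpy as np

# Hypothetical participant with no survey_timings rows recorded.
known_timings_submits = np.array([])

# Without the guard, the reduction below is reached and raises.
try:
    np.min(np.abs(known_timings_submits))
    empty_min_ok = True
except ValueError:
    empty_min_ok = False
print(empty_min_ok)  # False
```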
4 changes: 2 additions & 2 deletions forest/sycamore/responses.py
@@ -143,7 +143,7 @@ def agg_changed_answers_summary(
detail["time_to_answer"] = np.where(
detail["data_stream"] == "survey_timings",
detail["time_to_answer"],
-np.NaN
+np.nan
)

#####################################################################
@@ -251,7 +251,7 @@ def format_responses_by_submission(agg_data: pd.DataFrame) -> dict:
survey_df["survey_duration"] = np.where(
survey_df["data_stream"] == "survey_timings",
survey_df["survey_duration"],
-np.NaN
+np.nan
)

keep_cols = ["beiwe_id", "start_time", "end_time",
4 changes: 2 additions & 2 deletions forest/sycamore/submits.py
@@ -470,12 +470,12 @@ def survey_submits(
submit_lines3["time_to_submit"] = np.where(
submit_lines3["submit_flg"] == 1,
submit_lines3["time_to_submit"],
-np.NaN
+np.nan
)
submit_lines3["time_to_open"] = np.where(
submit_lines3["opened_flg"] == 1,
submit_lines3["time_to_open"],
-np.NaN
+np.nan
)
return submit_lines3.sort_values(["survey id", "beiwe_id"]
).drop_duplicates()
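The `np.NaN` → `np.nan` edits across these files anticipate NumPy 2.0, which removed the `np.NaN` alias; `np.nan` is the surviving spelling. The masking pattern itself is unchanged: keep the value where the flag is set, otherwise blank it out with NaN (the arrays below are made-up illustrations, not real submit data):

```python
import numpy as np

time_to_submit = np.array([12.0, 45.0, 7.5])
submit_flg = np.array([1, 0, 1])

# Same np.where pattern as the diff, with the NumPy-2.0-safe spelling.
masked = np.where(submit_flg == 1, time_to_submit, np.nan)
print(masked)  # values: 12.0, nan, 7.5
```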
2 changes: 1 addition & 1 deletion mypy.ini
@@ -1,5 +1,5 @@
[mypy]
-python_version = 3.8
+python_version = 3.11

[mypy-holidays]
ignore_missing_imports = True
12 changes: 6 additions & 6 deletions tutorials/forest_usage.ipynb
@@ -30,7 +30,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
-"Before we begin, we need to check the current distribution of Python. Note that forest is built using Python 3.8. "
+"Before we begin, we need to check the current distribution of Python. Note that forest is built using Python 3.11. "
]
},
{
@@ -74,7 +74,7 @@
"source": [
"*The output should display two lines.* \n",
"\n",
-"1. The Python version installed- make sure you are not using a version of Python that is earlier than 3.8\n",
+"1. The Python version installed: make sure you are not using a version of Python that is earlier than 3.11\n",
"2. The path to where Python is currently installed"
]
},
@@ -709,10 +709,10 @@
"evalue": "name 'response_data' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/var/folders/nl/92kzg8c56mn1872898r7rjr40000gn/T/ipykernel_96458/1316156180.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m## Make sure the data is sorted according to date\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mresponse_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Date'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mresponse_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdrop\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mtime_series_plot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvar_to_plot\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mylab\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxlab\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'Date'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_x_ticks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'response_data' is not defined"
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)",
"\u001B[0;32m/var/folders/nl/92kzg8c56mn1872898r7rjr40000gn/T/ipykernel_96458/1316156180.py\u001B[0m in \u001B[0;36m<module>\u001B[0;34m\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[0;31m## Make sure the data is sorted according to date\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m----> 2\u001B[0;31m \u001B[0mresponse_data\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0msort_values\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'Date'\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minplace\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;32mTrue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 3\u001B[0m \u001B[0mresponse_data\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mreset_index\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mdrop\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;32mTrue\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0minplace\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;32mTrue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 4\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 5\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0mtime_series_plot\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mvar_to_plot\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mylab\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;34m''\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mxlab\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;34m'Date'\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mnum_x_ticks\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;36m4\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
"\u001B[0;31mNameError\u001B[0m: name 'response_data' is not defined"
]
}
],
