From 226062cdee4d8c2c13c5ceba4eb2df69fff3b43a Mon Sep 17 00:00:00 2001 From: Patrick Golden Date: Mon, 21 Oct 2024 18:26:31 -0400 Subject: [PATCH] Rewrite the get_publication script (#832) This overhauls the script to update publications from Google Scholar. The previous script worked, but it had some drawbacks, namely that it required manually editing JSON with updated information. The new script does not, and it outputs data in a format that will create meaningful diffs with the existing file format when data is updated. Changes: * Moves from argparse to typer for dealing with CLI arguments. * Separates fetching data from generating the publications.json file (this was necessary for development to prevent having to hit Google Scholar on every change). * Removes need to filter publications based on year-- Google Scholar is now taken as the single source of truth for publications. * Takes into account publications whose years have changed in Google Scholar versus what has been already included in the publications.json file. * Better record matching that ignores differences in non-alphanumeric characters. Previously, there were several false positives that had to do with changes in punctuation. * Logging to stderr rather than an outputted report. * Single source of truth for adding links to publications without them in Google Scholar. (Previously, they had to be input by hand in the produced JSON file). --------- Co-authored-by: Kevin Schaper --- Makefile | 2 +- frontend/src/pages/about/publications.json | 204 ++++++--- scripts/get_publications.py | 486 +++++++++++++-------- 3 files changed, 448 insertions(+), 244 deletions(-) diff --git a/Makefile b/Makefile index c9a6cdbef..86cd276b1 100644 --- a/Makefile +++ b/Makefile @@ -108,7 +108,7 @@ data: @echo "Generating frontpage metadata..." $(RUN) python scripts/generate_fixtures.py --metadata @echo "Generating publications data..." 
- $(RUN) python scripts/get_publications.py --update + $(RUN) python scripts/get_publications.py update --update-data @echo "Generating resources data..." wget https://raw.githubusercontent.com/monarch-initiative/monarch-documentation/main/src/docs/resources/monarch-app-resources.json -O frontend/src/pages/resources/resources.json make format-frontend diff --git a/frontend/src/pages/about/publications.json b/frontend/src/pages/about/publications.json index 17b15ecb3..49ab2c527 100644 --- a/frontend/src/pages/about/publications.json +++ b/frontend/src/pages/about/publications.json @@ -1,30 +1,26 @@ { "metadata": { - "total": 13905, - "num_publications": 142, - "last_5_yrs": 9075, + "total": 12913, + "num_publications": 146, + "last_5_yrs": 9701, "cites_per_year": { - "2009": 43, - "2010": 90, - "2011": 146, - "2012": 238, - "2013": 243, - "2014": 349, - "2015": 696, - "2016": 792, - "2017": 984, - "2018": 1139, - "2019": 1385, - "2020": 1288, - "2021": 1557, - "2022": 2114, - "2023": 2245, - "2024": 452 + "2013": 45, + "2014": 170, + "2015": 480, + "2016": 632, + "2017": 834, + "2018": 946, + "2019": 1213, + "2020": 1177, + "2021": 1406, + "2022": 1912, + "2023": 2041, + "2024": 1908 }, - "hindex": 48, - "hindex5y": 42, - "i10index": 96, - "i10index5y": 92 + "hindex": 50, + "hindex5y": 46, + "i10index": 97, + "i10index5y": 93 }, "publications": [ { @@ -45,24 +41,11 @@ "journal": "BMC Medical Informatics and Decision Making", "issue": "24(1):30", "link": "https://link.springer.com/article/10.1186/s12911-024-02439-w" - } - ] - }, - { - "year": 2023, - "items": [ - { - "title": "De novo TRPM3 missense variant associated with neurodevelopmental delay and manifestations of cerebral palsy", - "authors": "Jagadish Chandrabose Sundaramurthi, Anita M Bagley, Hannah Blau, Leigh Carmody, Amy Crandall, Daniel Danis, Michael A Gargano, Anxhela Gjyshi Gustafson, Ellen M Raney, Mallory Shingle, Jon R Davids, Peter N Robinson", - "year": 2023, - "journal": "Molecular Case Studies", 
- "issue": "9(4):a006293", - "link": "https://molecularcasestudies.cshlp.org/content/9/4/a006293.short" }, { "title": "The Human Phenotype Ontology in 2024: phenotypes around the world", "authors": "Michael A Gargano, Nicolas Matentzoglu, Ben Coleman, Eunice B Addo-Lartey, Anna V Anagnostopoulos, Joel Anderton, Paul Avillach, Anita M Bagley, Eduard Bak\u0161tein, James P Balhoff, Gareth Baynam, Susan M Bello, Michael Berk, Holli Bertram, Somer Bishop, Hannah Blau, David F Bodenstein, Pablo Botas, Kaan Boztug, Jolana \u010cady, Tiffany J Callahan, Rhiannon Cameron, Seth J Carbon, Francisco Castellanos, J Harry Caufield, Lauren E Chan, Christopher G Chute, Jaime Cruz-Rojo, No\u00e9mi Dahan-Oliel, Jon R Davids, Maud de Dieuleveult, Vinicius de Souza, Bert BA de Vries, Esther de Vries, J Raymond DePaulo, Beata Derfalvi, Ferdinand Dhombres, Claudia Diaz-Byrd, Alexander JM Dingemans, Bruno Donadille, Michael Duyzend, Reem Elfeky, Shahim Essaid, Carolina Fabrizzi, Giovanna Fico, Helen V Firth, Yun Freudenberg-Hua, Janice M Fullerton, Davera L Gabriel, Kimberly Gilmour, Jessica Giordano, Fernando S Goes, Rachel Gore Moses, Ian Green, Matthias Griese", - "year": 2023, + "year": 2024, "journal": "Nucleic Acids Research", "issue": ":gkad1005", "link": "https://academic.oup.com/nar/advance-article-abstract/doi/10.1093/nar/gkad1005/7416384" @@ -70,23 +53,15 @@ { "title": "The Monarch Initiative in 2024: an analytic platform integrating phenotypes, genes and diseases across species", "authors": "Tim E Putman, Kevin Schaper, Nicolas Matentzoglu, Vincent P Rubinetti, Faisal S Alquaddoomi, Corey Cox, J Harry Caufield, Glass Elsarboukh, Sarah Gehrke, Harshad Hegde, Justin T Reese, Ian Braun, Richard M Bruskiewich, Luca Cappelletti, Seth Carbon, Anita R Caron, Lauren E Chan, Christopher G Chute, Katherina G Cortes, Vin\u00edcius De Souza, Tommaso Fontana, Nomi L Harris, Emily L Hartley, Eric Hurwitz, Julius OB Jacobsen, Madan Krishnamurthy, Bryan J Laraway, James A McLaughlin, Julie 
A McMurry, Sierra AT Moxon, Kathleen R Mullen, Shawn T O\u2019Neil, Kent A Shefchek, Ray Stefancsik, Sabrina Toro, Nicole A Vasilevsky, Ramona L Walls, Patricia L Whetzel, David Osumi-Sutherland, Damian Smedley, Peter N Robinson, Christopher J Mungall, Melissa A Haendel, Monica C Munoz-Torres", - "year": 2023, + "year": 2024, "journal": "Nucleic Acids Research", "issue": ":gkad1082", "link": "https://academic.oup.com/nar/advance-article-abstract/doi/10.1093/nar/gkad1082/7449493" }, - { - "title": "The Medical Action Ontology: A Tool for Annotating and Analyzing Treatments and Clinical Management of Human Disease", - "authors": "Leigh C Carmody, Michael A Gargano, Sabrina Toro, Nicole A Vasilevsky, Margaret P Adam, Hannah Blau, Lauren E Chan, David Gomez-Andres, Rita Horvath, Markus S Ladewig, David Lewis-Smith, Hanns Lochmueller, Nicolas A Matentzoglu, Monica C Munoz-Torres, Catharina Schuetz, Megan L Kraus, Berthold Seitz, Morgan N Similuk, Teresa Sparks, Timmy Strauss, Emilia M Swietlik, Rachel Thompson, Xingmin Aaron Zhang, Christopher J Mungall, Melissa A Haendel, Peter N Robinson", - "year": 2023, - "journal": "medRxiv", - "issue": ":2023.07. 13.23292612", - "link": "https://www.medrxiv.org/content/10.1101/2023.07.13.23292612.abstract" - }, { "title": "Predicting nutrition and environmental factors associated with female reproductive disorders using a knowledge graph and random forests", "authors": "Lauren E Chan, Elena Casiraghi, Timothy Putman, Justin Reese, Quaker E Harmon, Kevin Schaper, Harshad Hedge, Giorgio Valentini, Charles Schmitt, Alison Motsinger-Reif, Janet E Hall, Christopher J Mungall, Peter N Robinson, Melissa A Haendel", - "year": 2023, + "year": 2024, "journal": "medRxiv", "issue": ":2023.07. 
14.23292679", "link": "https://www.medrxiv.org/content/10.1101/2023.07.14.23292679.abstract" @@ -94,11 +69,136 @@ { "title": "On the limitations of large language models in clinical diagnosis", "authors": "Justin Reese, Daniel Danis, J Harry Caufield, Elena Casiraghi, Giorgio Valentini, Christopher J Mungall, Peter N Robinson", - "year": 2023, + "year": 2024, "journal": "medRxiv", "issue": ":2023.07. 13.23292613", "link": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10370243/" }, + { + "title": "Structured prompt interrogation and recursive extraction of semantics (SPIRES): A method for populating knowledge bases using zero-shot learning", + "authors": "J Harry Caufield, Harshad Hegde, Vincent Emonet, Nomi L Harris, Marcin P Joachimiak, Nicolas Matentzoglu, HyeongSik Kim, Sierra AT Moxon, Justin T Reese, Melissa A Haendel, Peter N Robinson, Christopher J Mungall", + "year": 2024, + "journal": "arXiv preprint arXiv:2304.02711", + "issue": "", + "link": "https://arxiv.org/abs/2304.02711" + }, + { + "title": "A corpus of GA4GH Phenopackets: case-level phenotyping for genomic diagnostics and discovery", + "authors": "Daniel Danis, Michael J Bamshad, Yasemin Bridges, Pilar Cacheiro, Leigh C Carmody, Jessica X Chong, Ben Coleman, Raymond Dalgleish, Peter J Freeman, Adam SL Graefe, Tudor Groza, Julius OB Jacobsen, Adam Klocperk, Maaike Kusters, Markus S Ladewig, Anthony J Marcello, Teresa Mattina, Christopher J Mungall, Monica C Munoz-Torres, Justin T Reese, Filip Rehburg, Barbara CS Reis, Catharina Schuetz, Damian Smedley, Timmy Strauss, Jagadish Chandrabose Sundaramurthi, Sylvia Thun, Kyran Wissink, John F Wagstaff, David Zocche, Melissa A Haendel, Peter N Robinson", + "year": 2024, + "journal": "medRxiv", + "issue": ":2024.05. 
29.24308104", + "link": "https://www.medrxiv.org/content/10.1101/2024.05.29.24308104.abstract" + }, + { + "title": "Advancing diagnosis and research for rare genetic diseases in Indigenous peoples", + "authors": "Gareth Baynam, Daria Julkowska, Sarah Bowdin, Azure Hermes, Christopher R McMaster, Elissa Prichep, \u00c9tienne Richer, Francois H van der Westhuizen, Gabriela M Repetto, Helen Malherbe, Juergen KV Reichardt, Laura Arbour, Maui Hudson, Kelly du Plessis, Melissa Haendel, Phillip Wilcox, Sally Ann Lynch, Shamir Rind, Simon Easteal, Xavier Estivill, Nadine Caron, Meck Chongo, Yarlalu Thomas, Mary Catherine V Letinturier, Barend Christiaan Vorster", + "year": 2024, + "journal": "Nature Genetics", + "issue": "56(2):189-193", + "link": "https://www.nature.com/articles/s41588-023-01642-1" + }, + { + "title": "Critical assessment of variant prioritization methods for rare disease diagnosis within the Rare Genomes Project", + "authors": "Sarah L Stenton, Melanie C O\u2019Leary, Gabrielle Lemire, Grace E VanNoy, Stephanie DiTroia, Vijay S Ganesh, Emily Groopman, Emily O\u2019Heir, Brian Mangilog, Ikeoluwa Osei-Owusu, Lynn S Pais, Jillian Serrano, Moriel Singer-Berk, Ben Weisburd, Michael W Wilson, Christina Austin-Tse, Marwa Abdelhakim, Azza Althagafi, Giulia Babbi, Riccardo Bellazzi, Samuele Bovo, Maria Giulia Carta, Rita Casadio, Pieter-Jan Coenen, Federica De Paoli, Matteo Floris, Manavalan Gajapathy, Robert Hoehndorf, Julius OB Jacobsen, Thomas Joseph, Akash Kamandula, Panagiotis Katsonis, Cyrielle Kint, Olivier Lichtarge, Ivan Limongelli, Yulan Lu, Paolo Magni, Tarun Karthik Kumar Mamidi, Pier Luigi Martelli, Marta Mulargia, Giovanna Nicora, Keith Nykamp, Vikas Pejaver, Yisu Peng, Thi Hong Cam Pham, Maurizio S Podda, Aditya Rao, Ettore Rizzo, Vangala G Saipradeep, Castrense Savojardo, Peter Schols, Yang Shen, Naveen Sivadasan, Damian Smedley, Dorian Soru, Rajgopal Srinivasan, Yuanfei Sun, Uma Sunderam, Wuwei Tan, Naina Tiwari, Xiao Wang, Yaqiong Wang, Amanda 
Williams, Elizabeth A Worthey, Rujie Yin, Yuning You, Daniel Zeiberg, Susanna Zucca, Constantina Bakolitsa, Steven E Brenner, Stephanie M Fullerton, Predrag Radivojac, Heidi L Rehm, Anne O\u2019Donnell-Luria", + "year": 2024, + "journal": "Human Genomics", + "issue": "18(1):44", + "link": "https://link.springer.com/article/10.1186/s40246-024-00604-w" + }, + { + "title": "Evaluation of the Diagnostic Accuracy of GPT-4 in Five Thousand Rare Disease Cases", + "authors": "Justin T Reese, Leonardo Chimirri, Daniel Danis, J Harry Caufield, Kyran Wissink Wissink, Elena Casiraghi, Giorgio Valentini, Melissa A Haendel, Christopher J Mungall, Peter N Robinson", + "year": 2024, + "journal": "medRxiv", + "issue": ":2024.07. 22.24310816", + "link": "https://www.medrxiv.org/content/10.1101/2024.07.22.24310816.abstract" + }, + { + "title": "FastHPOCR: pragmatic, fast, and accurate concept recognition using the human phenotype ontology", + "authors": "Tudor Groza, Dylan Gration, Gareth Baynam, Peter N Robinson", + "year": 2024, + "journal": "Bioinformatics", + "issue": "40(7)", + "link": "https://academic.oup.com/bioinformatics/article-abstract/40/7/btae406/7698025" + }, + { + "title": "Gene set summarization using large language models", + "authors": "Marcin P Joachimiak, J Harry Caufield, Nomi L Harris, Hyeongsik Kim, Christopher J Mungall", + "year": 2024, + "journal": "ArXiv", + "issue": "", + "link": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10246080/" + }, + { + "title": "Harnessing Consumer Wearable Digital Biomarkers for Individualized Recognition of Postpartum Depression Using the All of Us Research Program Data Set: Cross-Sectional Study", + "authors": "Eric Hurwitz, Zachary Butzin-Dozier, Hiral Master, Shawn T O'Neil, Anita Walden, Michelle Holko, Rena C Patel, Melissa A Haendel", + "year": 2024, + "journal": "JMIR mHealth and uHealth", + "issue": "12(1):e54622", + "link": "https://mhealth.jmir.org/2024/1/e54622/" + }, + { + "title": "Improving prenatal diagnosis 
through standards and aggregation", + "authors": "Michael H Duyzend, Pilar Cacheiro, Julius OB Jacobsen, Jessica Giordano, Harrison Brand, Ronald J Wapner, Michael E Talkowski, Peter N Robinson, Damian Smedley", + "year": 2024, + "journal": "Prenatal diagnosis 44 (4), 454-464, 2024", + "issue": "44(4):454-464", + "link": "https://obgyn.onlinelibrary.wiley.com/doi/abs/10.1002/pd.6522" + }, + { + "title": "Leveraging Generative AI to Accelerate Biocuration of Medical Actions for Rare Disease", + "authors": "Enock Niyonkuru, J Harry Caufield, Leigh Carmody, Michael Gargano, Sabrina Toro, Trish Whetzel, Hannah Blau, Mauricio Soto, Elena Casiraghi, Leonardo Chimirri, Justin T Reese, Giorgio Valentini, Melissa A Haendel, Christopher J Mungall, Peter N Robinson", + "year": 2024, + "journal": "medRxiv", + "issue": ":2024.08. 22.24310814", + "link": "https://www.medrxiv.org/content/10.1101/2024.08.22.24310814.abstract" + }, + { + "title": "Replacing non-biomedical concepts improves embedding of biomedical concepts", + "authors": "Enock Niyonkuru, Mauricio Soto Gomez, Elena Casiraghi, Stephan Antogiovanni, Hannah Blau, Justin T Reese, Giorgio Valentini, Peter N Robinson", + "year": 2024, + "journal": "bioRxiv", + "issue": ":2024.07. 
01.601556", + "link": "https://www.biorxiv.org/content/10.1101/2024.07.01.601556.abstract" + }, + { + "title": "The Vertebrate Breed Ontology: Towards Effective Breed Data Standardization", + "authors": "Kathleen R Mullen, Imke Tammen, Nicolas A Matentzoglu, Marius Mather, Christopher J Mungall, Melissa A Haendel, Frank W Nicholas, Sabrina Toro", + "year": 2024, + "journal": "arXiv preprint arXiv:2406.02623", + "issue": "", + "link": "https://arxiv.org/abs/2406.02623" + }, + { + "title": "Towards a standard benchmark for variant and gene prioritisation algorithms: PhEval-Phenotypic inference Evaluation framework", + "authors": "Yasemin S Bridges, Vinicius de Souza, Katherina G Cortes, Melissa Haendel, Nomi L Harris, Daniel R Korn, Nikolaos M Marinakis, Nicolas Matentzoglu, James A McLaughlin, Christopher J Mungall, David J Osumi-Sutherland, Peter N Robinson, Damian Smedley, Julius OB Jacobsen", + "year": 2024, + "journal": "bioRxiv", + "issue": ":2024.06. 13.598672", + "link": "https://www.biorxiv.org/content/10.1101/2024.06.13.598672.abstract" + } + ] + }, + { + "year": 2023, + "items": [ + { + "title": "De novo TRPM3 missense variant associated with neurodevelopmental delay and manifestations of cerebral palsy", + "authors": "Jagadish Chandrabose Sundaramurthi, Anita M Bagley, Hannah Blau, Leigh Carmody, Amy Crandall, Daniel Danis, Michael A Gargano, Anxhela Gjyshi Gustafson, Ellen M Raney, Mallory Shingle, Jon R Davids, Peter N Robinson", + "year": 2023, + "journal": "Molecular Case Studies", + "issue": "9(4):a006293", + "link": "https://molecularcasestudies.cshlp.org/content/9/4/a006293.short" + }, + { + "title": "The Medical Action Ontology: A Tool for Annotating and Analyzing Treatments and Clinical Management of Human Disease", + "authors": "Leigh C Carmody, Michael A Gargano, Sabrina Toro, Nicole A Vasilevsky, Margaret P Adam, Hannah Blau, Lauren E Chan, David Gomez-Andres, Rita Horvath, Markus S Ladewig, David Lewis-Smith, Hanns Lochmueller, Nicolas A 
Matentzoglu, Monica C Munoz-Torres, Catharina Schuetz, Megan L Kraus, Berthold Seitz, Morgan N Similuk, Teresa Sparks, Timmy Strauss, Emilia M Swietlik, Rachel Thompson, Xingmin Aaron Zhang, Christopher J Mungall, Melissa A Haendel, Peter N Robinson", + "year": 2023, + "journal": "medRxiv", + "issue": ":2023.07. 13.23292612", + "link": "https://www.medrxiv.org/content/10.1101/2023.07.13.23292612.abstract" + }, { "title": "The International Mouse Phenotyping Consortium: comprehensive knockout phenotyping underpinning the study of human disease", "authors": "Tudor Groza, Federico Lopez Gomez, Hamed Haseli Mashhadi, Violeta Mu\u00f1oz-Fuentes, Osman Gunes, Robert Wilson, Pilar Cacheiro, Anthony Frost, Piia Keskivali-Bond, Bora Vardal, Aaron McCoy, Tsz Kwan Cheng, Luis Santos, Sara Wells, Damian Smedley, Ann-Marie Mallon, Helen Parkinson", @@ -123,14 +223,6 @@ "issue": "4(1):2200016", "link": "https://onlinelibrary.wiley.com/doi/abs/10.1002/ggn2.202200016" }, - { - "title": "Structured prompt interrogation and recursive extraction of semantics (SPIRES): A method for populating knowledge bases using zero-shot learning", - "authors": "J Harry Caufield, Harshad Hegde, Vincent Emonet, Nomi L Harris, Marcin P Joachimiak, Nicolas Matentzoglu, HyeongSik Kim, Sierra AT Moxon, Justin T Reese, Melissa A Haendel, Peter N Robinson, Christopher J Mungall", - "year": 2023, - "journal": "arXiv preprint arXiv:2304.02711", - "issue": "", - "link": "https://arxiv.org/abs/2304.02711" - }, { "title": "The Ontology of Biological Attributes (OBA)\u2014computational traits for the life sciences", "authors": "Ray Stefancsik, James P Balhoff, Meghan A Balk, Robyn L Ball, Susan M Bello, Anita R Caron, Elissa J Chesler, Vinicius de Souza, Sarah Gehrke, Melissa Haendel, Laura W Harris, Nomi L Harris, Arwa Ibrahim, Sebastian Koehler, Nicolas Matentzoglu, Julie A McMurry, Christopher J Mungall, Monica C Munoz-Torres, Tim Putman, Peter Robinson, Damian Smedley, Elliot Sollis, Anne E Thessen, Nicole 
Vasilevsky, David O Walton, David Osumi-Sutherland", diff --git a/scripts/get_publications.py b/scripts/get_publications.py index 6b8b9c0ff..78674d826 100644 --- a/scripts/get_publications.py +++ b/scripts/get_publications.py @@ -1,62 +1,90 @@ -"""This script is intended to assist in updating the publications page of the Monarch website. +""" +This script is intended to assist in updating the publications page of the Monarch website. It uses the scholarly package to search for publications citing the Monarch Initiative and metadata (counts of publications citing Monarch) from Google Scholar. +""" -It then writes the results to a json file, as well as a report containing: - - the total number of publications found by scholarly - - publications with no link (to be manually added) - - duplicates returned by scholarly - - publications that are already in the publications.json file - +from collections import defaultdict +from functools import cache, reduce +import itertools +import json +from pathlib import Path +import re +import sys +from dataclasses import asdict, dataclass, replace, fields +from loguru import logger +from typing import DefaultDict, List, Optional +from typing_extensions import Annotated -Links for known pubs with no link: -- Metrics to Assess Value of Biomedical Digital Repositories: - https://zenodo.org/record/203295 +import typer +from scholarly import Author, scholarly -- The Monarch Initiative: Insights across species reveal human disease mechanisms: - https://www.biorxiv.org/content/10.1101/055756v1 +app = typer.Typer() -- k-BOOM: a Bayesian approach to ontology structure inference, with applications in disease ontology construction. 
bioRxiv 2019: 048843 - https://www.biorxiv.org/content/10.1101/048843v3 -""" -import argparse -import json -from pathlib import Path -from typing import List +# https://scholar.google.com/citations?user=zmUEDj0AAAAJ +MONARCH_GOOGLE_SCHOLAR_ID = "zmUEDj0AAAAJ" -from scholarly import scholarly # type: ignore -import pprint +KNOWN_LINKS = { + "Metrics to Assess Value of Biomedical Digital Repositories": "https://zenodo.org/record/203295", + "The Monarch Initiative: Insights across species reveal human disease mechanisms": "https://www.biorxiv.org/content/10.1101/055756v1", + "k-BOOM: a Bayesian approach to ontology structure inference, with applications in disease ontology construction. bioRxiv 2019: 048843": "https://www.biorxiv.org/content/10.1101/048843v3", + "The Human Phenotype Ontology in 2024: phenotypes around the world": "https://doi.org/10.1093/nar/gkad1005", + "Metrics to assess value of biomedical digital repositories: response to RFI NOT-OD-16-133": "https://zenodo.org/records/203295", + "The GA4GH Phenopacket schema: A computable representation of clinical data for precision medicine": "https://www.medrxiv.org/content/10.1101/2021.11.27.21266944v1", +} -pp = pprint.PrettyPrinter(indent=2, sort_dicts=False).pprint -outdir = Path(__file__).parent.parent / "frontend" / "src" / "pages" / "about" script_dir = Path(__file__).parent +default_metadata_file = script_dir / "metadata.json" +default_scholarly_data = script_dir / "scholarly_output.json" +default_publications_file = ( + script_dir.parent / "frontend/src/pages/about/publications.json" +) + + +@dataclass +class MonarchPublication: + title: str + authors: str + year: int + journal: str + issue: str + link: Optional[str] + + def key(self): + return title_to_key(self.title) + + +def title_to_key(title: str) -> str: + return re.sub(r"\W", "", title.lower()) + + +def replace_links(pubs: list[MonarchPublication]): + links_by_title = {pub.key(): pub for pub in pubs} -pubs_file = outdir / "publications.json" 
-new_pubs_file = script_dir / "new_pubs.json" -report_file = script_dir / "pubs_report.txt" -scholarly_file = script_dir / "scholarly_output.json" + for title, link in KNOWN_LINKS.items(): + key = title_to_key(title) + if key in links_by_title: + links_by_title[key].link = link -# These either aren't publications, are known duplicates, or have bad/missing info -EXCLUDE = [ -] + +@cache +def get_scholarly_author() -> Author: + author = scholarly.search_author_id(id=MONARCH_GOOGLE_SCHOLAR_ID) + scholarly.fill(author, sections=["basics", "indices", "counts", "publications"]) # type: ignore + return author -def get_citation_metadata(): +def fetch_citation_metadata(): """Get citation metadata from google scholar See https://scholarly.readthedocs.io/en/latest/DataTypes.html?highlight=hindex for details """ - author = scholarly.search_author_id(id="zmUEDj0AAAAJ") - scholarly.fill(author, sections=["basics", "indices", "counts", "publications"]) # type: ignore - total = ( - author["citedby"] - # - len([pub for pub in author["publications"] if pub["bib"]["title"] in EXCLUDE]) - - len([pub for pub in author["publications"] if int(pub["bib"]["pub_year"]) < 2012]) - ) + author = get_scholarly_author() + citation_info = { - "total": total, # type: ignore + "total": author["citedby"], # type: ignore "num_publications": len(author["publications"]), # type: ignore "last_5_yrs": author["citedby5y"], # type: ignore "cites_per_year": author["cites_per_year"], # type: ignore @@ -65,34 +93,32 @@ def get_citation_metadata(): "i10index": author["i10index"], # type: ignore "i10index5y": author["i10index5y"], # type: ignore } + return citation_info -def get_pubs_from_scholarly(): +def fetch_scholarly_publications() -> List[MonarchPublication]: """Search for Monarch publications using scholarly""" - author = scholarly.search_author_id(id="zmUEDj0AAAAJ") - scholarly.fill(author, sections=["publications"], sortby="year") # type: ignore + author = get_scholarly_author() publications = 
author["publications"] # type: ignore - pubs = [] + pubs: List[MonarchPublication] = [] + for p in publications: scholarly.fill(p, sections=["bib"]) # type: ignore bib = p["bib"] # type: ignore - # if bib["title"] in EXCLUDE: # type: ignore - # continue - if "pub_year" not in bib or int(bib["pub_year"]) < 2012: - continue + title = bib["title"] # type: ignore authors = ", ".join(bib["author"].split(" and ")) # type: ignore - year = int(bib["pub_year"]) - journal = ( - bib["journal"] - if "journal" in bib - else bib["publisher"] - if "publisher" in bib - else bib["citation"] - if "citation" in bib - else "" - ) + year = int(bib["pub_year"]) # type: ignore + + journal = "" + if "journal" in bib: + journal = bib["journal"] + elif "publisher" in bib: + journal = bib["publisher"] + elif "citation" in bib: + journal = bib["citation"] + issue = "" if "volume" in bib: issue += f"{bib['volume']}" @@ -100,158 +126,244 @@ def get_pubs_from_scholarly(): issue += f"({bib['number']})" if "pages" in bib: issue += f":{bib['pages']}" - link = f"{p['pub_url']}" if ("pub_url" in p and "scholar.google" not in p["pub_url"]) else "" + + link = ( + f"{p['pub_url']}" + if ("pub_url" in p and "scholar.google" not in p["pub_url"]) + else None + ) + pubs.append( - { - "title": title, - "authors": authors, - "year": year, - "journal": journal, - "issue": issue, - "link": link, - } + MonarchPublication( + title=title, + authors=authors, + year=year, + journal=journal, + issue=issue, + link=link, + ) ) + return pubs -def pick_best(pub1, pub2): +def combine_publications(pub1: MonarchPublication, pub2: MonarchPublication): """Return publication with most info from two publications""" - new_pub = {"title": pub1["title"]} + # Copy the publication from the first candidate + new_pub = replace(pub1) + # pick year from publication with link - if pub1["link"]: - new_pub["year"] = pub1["year"] - else: - new_pub["year"] = pub2["year"] + if not pub1.link and pub2.link: + new_pub.link = pub2.link + 
new_pub.year = pub2.year + # pick longer (non-empty) from each - for k, v in pub1.items(): - if k in ["year", "title"]: + for f in fields(pub1): + if f.name in ["year", "title"]: continue - new_pub[k] = v if len(v) > len(pub2[k]) else pub2[k] + val1 = getattr(pub1, f.name) + val2 = getattr(pub2, f.name) + if val1 is None: + if val2 is not None: + setattr(new_pub, f.name, val2) + elif val2 is not None and len(val2) > len(val1): + setattr(new_pub, f.name, val2) + return new_pub -def check_for_dups(publication_list: List[dict]): +def dedup_publications(publication_list: List[MonarchPublication]): """Check for duplicate publications from scholarly and pick best info from each""" - checked = [] - duplicates = [] - for pub in publication_list: - if pub["title"].lower() not in [p["title"].lower() for p in checked]: - checked.append(pub) + publication_list = sorted(publication_list, key=lambda p: p.key()) + pubs_by_key = itertools.groupby(publication_list, key=lambda p: p.key()) + + deduped: List[MonarchPublication] = [] + + for key, pubs in pubs_by_key: + pubs_list = list(pubs) + if len(pubs_list) == 1: + pub = pubs_list[0] else: - for ind, p in enumerate(checked): - if p["title"].lower() == pub["title"].lower(): - duplicates.append(pub["title"]) - checked[ind] = pick_best(p, pub) - break - return checked, duplicates + logger.info(f"Found {len(pubs_list)} records for {key}") + pub = reduce(combine_publications, pubs_list) + deduped.append(pub) + + return deduped -def find_existing(current_data, new_data): +def extend_current_pubs( + current_pubs: List[MonarchPublication], scholarly_pubs: List[MonarchPublication] +): """Find publications in scholarly_data that are already in publications.json""" - existing_titles = [pub["title"].lower() for pub in current_data] - dups = [pub["title"] for pub in new_data if pub["title"].lower() in existing_titles] - filtered = [pub for pub in new_data if pub["title"].lower() not in existing_titles] - ### The below returns a different list than the above, but I'm not
sure why ### - # dups = [] - # for ind, pub in enumerate(new_data): - # if pub["title"].lower() in existing_titles: - # new_data.pop(ind) - # dups.append(pub["title"]) - return filtered, dups - - -def write_citations(publications: List[dict], metadata: dict): - """Write JSON file with citations for each publication""" - # group by year - pubs_sorted = [] - for pub in publications: - if pub["year"] not in [year["year"] for year in pubs_sorted]: - pubs_sorted.append({"year": pub["year"], "items": [pub]}) + existing_by_key = {pub.key(): pub for pub in current_pubs} + + new_pubs_ct = 0 + updated_pubs_ct = 0 + existing_pubs_ct = 0 + + new_pubs: list[MonarchPublication] = [] + + for pub in scholarly_pubs: + existing_pub = existing_by_key.get(pub.key(), None) + + if existing_pub: + if pub.year > existing_pub.year: + logger.info( + f"Updating publication year for {pub.title} from {existing_pub.year} to {pub.year}" + ) + existing_pub.year = pub.year + updated_pubs_ct += 1 + else: + existing_pubs_ct += 1 else: - for year in pubs_sorted: - if year["year"] == pub["year"]: - year["items"].insert(0, pub) - break - output = {"metadata": metadata, "publications": pubs_sorted} - # write to output file - with open(new_pubs_file, "w") as f: - json.dump(output, f, indent=2) - - -def write_report(report: List[str]): - """Write report of publications with no link or with Google Scholar link""" - with open(report_file, "w") as f: - f.write("\n".join(report)) - - -def main(update: bool): - """Main function""" - report = [] - metadata = get_citation_metadata() - - # Get publications from scholarly or existing file - Path(scholarly_file).touch() - with open(scholarly_file, "r+") as f: - if args.update: - scholarly_data = get_pubs_from_scholarly() - json.dump(scholarly_data, f, indent=2) - else: - scholarly_data = json.load(f) - report.append(f"{'-'*120}\nFound {len(scholarly_data)} publications in Google Scholar") + new_pubs.append(pub) + new_pubs_ct += 1 - # Check for duplicate 
publications in scholarly_data - checked, dups = check_for_dups(scholarly_data) - if dups: - report.append( - f"{'-'*120}\nFound (and removed) {len(scholarly_data) - len(checked)} duplicate publications in scholarly_data:" - ) - for pub in dups: - report.append(f"\n\t{pub}") - scholarly_data = checked - - # Flag publications with no link (to manually edit in publications.json later) - nolinks = [pub["title"] for pub in scholarly_data if not pub["link"]] # type: ignore - if nolinks: - report.append(f"{'-'*120}\nFound {len(nolinks)} publications with no link:") - for pub in nolinks: - report.append(f"\n\t{pub}") - - # Filter out publications already in publications.json - if not Path(pubs_file).exists(): - report.append(f"{'-'*120}\nNo publications.json file found. Creating one now...") - with open(pubs_file, "w") as f: - json.dump({"metadata": {}, "publications": []}, f, indent=2) - with open(pubs_file, "r") as f: - current_data = json.load(f) - current_pubs = [pub for year in current_data["publications"] for pub in year["items"]] - filtered, dups = find_existing(current_pubs, scholarly_data) - if dups: - report.append( - f"{'-'*120}\nFound {len(dups)} publications already in publications.json ({len(filtered)} new publications)\nDuplicates:" - ) - for pub in dups: - report.append(f"\n\t{pub}") - report.append(f"\n{'-'*120}\nNew publications:") - for pub in filtered: - report.append(f"\n\t{pub['title']}") + logger.info(f"Untouched publications: {existing_pubs_ct}") + logger.info(f"Updated publications: {updated_pubs_ct}") + logger.info(f"New publications: {new_pubs_ct}") - citations = filtered - return citations, metadata, report + combined_pubs = current_pubs[:] + combined_pubs.extend(new_pubs) + return combined_pubs -if __name__ == "__main__": - ### Parse arguments - parser = argparse.ArgumentParser() - parser.add_argument("-d", "--debug", help="Debug mode", action="store_true", default=False) - parser.add_argument( - "-u", "--update", help="Update 
scholarly_output.json", default=True, action=argparse.BooleanOptionalAction + +def add_scholarly_publications( + scholarly_file: Path, existing_publications_file: Optional[Path] +): + """Add scholarly publications to an existing publications.json file.""" + + with open(scholarly_file) as fd: + scholarly_pubs = [MonarchPublication(**pub_dict) for pub_dict in json.load(fd)] + + if existing_publications_file is not None: + with open(existing_publications_file) as fd: + current_data = json.load(fd) + current_pubs = [ + MonarchPublication(**pub) + for year in current_data["publications"] + for pub in year["items"] + ] + else: + current_pubs = [] + + # Check for duplicate publications in scholarly_data + scholarly_pubs = dedup_publications(scholarly_pubs) + + # Add known missing links + replace_links(scholarly_pubs) + + # Bail out if there are any missing links after adding replacements + missing_links = [pub for pub in scholarly_pubs if not pub.link] + if missing_links: + for pub in missing_links: + logger.error(f"No link for {pub.title}. 
Add in script before continuing.") + sys.exit(1) + + logger.info(f"Existing publications in system: {len(current_pubs)}") + logger.info(f"Publications from Google Scholar: {len(scholarly_pubs)}") + + # Merge the scholarly publications into the current publications and return the result + return extend_current_pubs(current_pubs, scholarly_pubs) + + +@app.command() +def update( + metadata_file: Annotated[ + Path, + typer.Argument( + exists=True, + file_okay=True, + dir_okay=False, + readable=True, + ), + ] = default_metadata_file, + publications_file: Annotated[ + Path, + typer.Argument( + exists=True, + file_okay=True, + dir_okay=False, + readable=True, + ), + ] = default_scholarly_data, + existing_data_file: Annotated[ + Optional[Path], + typer.Argument( + exists=True, + file_okay=True, + dir_okay=False, + readable=True, + ), + ] = default_publications_file, + output_file: Annotated[ + Path, + typer.Option( + "--output", + "-o", + file_okay=True, + dir_okay=False, + readable=True, + ), + ] = default_publications_file, + update_data: bool = False +): + if update_data: + fetch_metadata() + fetch_publications() + + pubs = add_scholarly_publications(publications_file, existing_data_file) + + # Convert publications to a dict and sort in such an order that the most recent year is first + pubs_by_year: DefaultDict[int, List[MonarchPublication]] = defaultdict(list) + for pub in pubs: + pubs_by_year[pub.year].append(pub) + + publications_by_year = sorted( + [{"year": k, "items": [asdict(p) for p in v]} for k, v in pubs_by_year.items()], + key=lambda p: p["year"], + reverse=True, ) - args = parser.parse_args() - # if args.debug: - # get_citation_metadata() - # sys.exit() + with metadata_file.open("r") as fd: + metadata = json.load(fd) + + output = json.dumps( + { + "metadata": metadata, + "publications": publications_by_year, + }, + indent=2, + ) + + with output_file.open("w") as fd: + fd.write(output) + - citations, metadata, report = main(args.update) - write_citations(citations, metadata) - 
write_report(report) +@app.command() +def fetch_metadata( + outfile: Annotated[Path, typer.Option("--output", "-o")] = default_metadata_file, +): + """Fetch the latest citation metadata from Google Scholar.""" + citation_metadata = fetch_citation_metadata() + output = json.dumps(citation_metadata, indent=2) + + with open(outfile, "w") as fd: + fd.write(output) + + +@app.command() +def fetch_publications( + outfile: Annotated[Path, typer.Option("--output", "-o")] = default_scholarly_data, +): + """Fetch the latest publication list from Google Scholar.""" + publications = fetch_scholarly_publications() + output = json.dumps([asdict(pub) for pub in publications], indent=2) + + with open(outfile, "w") as fd: + fd.write(output) + + +if __name__ == "__main__": + app()