All of 2020 update #3

Open
wants to merge 10 commits into base: master
336 changes: 336 additions & 0 deletions 01.0-TAR-irp_pubcount.ipynb
@@ -0,0 +1,336 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import glob\n",
"import requests\n",
"import time\n",
"import warnings\n",
"warnings.filterwarnings(action='ignore')\n",
"SCOPUS_API_KEY = 'Get_yer_own_stinkin_key'\n",
"SCOPUS_SEARCH_API_URL = \"http://api.elsevier.com/content/search/scopus\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Helper functions"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def compute_h_index(citations):\n",
" sortlist = sorted(citations, reverse = True)\n",
" N = len(citations)\n",
" i = 0\n",
" while i<N and sortlist[i] >= (i+1):\n",
" i += 1\n",
" return i\n",
"\n",
"\n",
"def scopus_query_citation_count(eidlist):\n",
" SCOPUS_API_KEY = 'Get_yer_own_stinkin_key'\n",
" SCOPUS_SEARCH_API_URL = \"http://api.elsevier.com/content/search/scopus\"\n",
" \"\"\"Take a list of EIDs and return a dictionary of EID -> citation_count\n",
" mapping.\"\"\"\n",
" eiddict = {}\n",
" failed = []\n",
" failed_count = 0\n",
" max_failed_attempts = 5\n",
" headers = {\"X-ELS-APIKey\": SCOPUS_API_KEY}\n",
"\n",
" sys.stdout.write(\"[+] Query EIDs from SCOPUS Search API\")\n",
" sys.stdout.flush()\n",
"\n",
" for eid in eidlist:\n",
" print(eid)\n",
" params = {\n",
" \"field\": \"eid,citedby-count\",\n",
" \"query\": \"eid(\" + eid + \")\"\n",
" }\n",
"\n",
" try:\n",
" r = requests.get(\n",
" SCOPUS_SEARCH_API_URL,\n",
" params=params,\n",
" headers=headers,\n",
" timeout=3\n",
" )\n",
" except requests.exceptions.Timeout:\n",
" sys.stdout.write(\"T\")\n",
" sys.stdout.flush()\n",
" failed.append(eid)\n",
" time.sleep(2)\n",
" continue\n",
"\n",
" if r.status_code != 200:\n",
" sys.stdout.write(\"F\")\n",
" sys.stdout.flush()\n",
"\n",
" if eid not in failed:\n",
" failed.append(eid)\n",
"\n",
" time.sleep(2)\n",
" continue\n",
"\n",
" body = r.json()\n",
" results = body.get(\"search-results\")\n",
"\n",
" if results is None:\n",
" sys.stdout.write(\"N\")\n",
" sys.stdout.flush()\n",
" continue\n",
"\n",
" # Extract information for each result on this page\n",
" entry = results.get(\"entry\")[0]\n",
" eid_item = entry.get(\"eid\")\n",
" citedby_count = entry.get(\"citedby-count\", \"0\")\n",
"\n",
" if eid_item:\n",
" eiddict.update({eid: citedby_count})\n",
" sys.stdout.write(\".\")\n",
" sys.stdout.flush()\n",
"\n",
" return eiddict, failed"
]
},
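{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of `compute_h_index` on a short, hypothetical list of citation counts; this cell is illustrative only and is not part of the report pipeline."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical citation counts, purely for illustration.\n",
"example_citations = [10, 8, 5, 4, 3]\n",
"# Four papers have at least 4 citations each, so the h-index is 4.\n",
"compute_h_index(example_citations)"
]
},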
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## first, load in the old file\n",
"I'm loading in the 2019 file, and then computing all the metrics needed for the main paragraph. To do that, I also need to pull in a file that has the IC for each investigator"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"df_2019 = pd.read_csv('data/complete/2019_complete.csv')\n",
"n_papes = df_2019.DOI.shape[0] \n",
"all_cites = df_2019['Cited-By Count'].sum() \n",
"citations = df_2019['Cited-By Count'].tolist() \n",
"h_index = compute_h_index(citations)\n",
"#Here i bring in the IC information\n",
"df_inv_ic = pd.read_csv('data/raw/investigator_ics.csv')\n",
"df_inv_ic['PI'] = df_inv_ic['Author Name'].str.extract(r'(\\s\\S+$)')\n",
"df_inv_ic['PI'] = df_inv_ic['PI'].str.strip()\n",
"\n",
"df_2019['PI'] = df_2019['SearchedAuthor']\n",
"df_2019.PI.fillna(df_2019['PI'], inplace=True)\n",
"df_2019.replace('Faith Berman', 'Berman', inplace=True)\n",
"df_2019.replace('Mcfarland', 'McFarland', inplace=True)\n",
"df_2019 = df_2019.merge(df_inv_ic[['PI', 'IC']].drop_duplicates(), on='PI', how='left', )\n",
"\n",
"\n",
"gb_s = df_2019.groupby('IC').size()\n",
"nimh = gb_s[4]\n",
"ninds = gb_s[5]\n",
"other = gb_s[0] + gb_s[1] + gb_s[2] + gb_s[3] "
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"nimh_2019 = df_2019[df_2019.IC=='NIMH'].EID.tolist()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Paragraph below checks out. "
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Since its inception in 2000 until August 2019, a total of 1196 peer-reviewed publications from intramural investigators have used data acquired in the FMRIF core facility. The total is distributed among 681 papers from NIMH, 337 papers from NINDS, and 149 from the other institutes. These papers have been cited a total of 128887 times for a combined h-index of 175. In other words, 175 papers using the FMRIF have been cited at least 175 times.\n"
]
}
],
"source": [
"print(f\"Since its inception in 2000 until August 2019, a total of {n_papes} peer-reviewed publications from \\\n",
"intramural investigators have used data acquired in the FMRIF core facility. The total is distributed among \\\n",
"{nimh} papers from NIMH, {ninds} papers from NINDS, and {other} from the other institutes. These papers \\\n",
"have been cited a total of {all_cites} times for a combined h-index of {h_index}. In other words, {h_index} \\\n",
"papers using the FMRIF have been cited at least {h_index} times.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Now time to generate and distribute new papers for marking by PIs. "
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"df_2020 = pd.read_csv('data/interim/output_file.csv') \n",
"df_2020 = df_2020[~df_2020['EID'].isin(df_2019.EID.tolist())]\n",
"df_2020.rename(columns={'PI':'Searched PI',\n",
" 'Authors' : 'Authors (scrambled)'}, inplace=True)\n",
"df_2020['year'] = df_2020.Date.apply(lambda x: x[0:4])\n",
"df_2020 = df_2020[df_2020.year.isin(['2019', '2020'])]\n",
"df_2020 = df_2020.drop('year', axis=1)\n",
"df_2020.to_csv('data/interim/2020_new_papers.csv', index=False) # new papers to mark"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Update citation counts from last year's report"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"df_2019_updated = pd.read_csv('2019_complete_update_2020.csv') #results of update_citation_counts\n",
"df_2019_updated.rename(columns={'PI':'Searched PI',\n",
" 'Authors' : 'Authors (scrambled)'}, inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### get new papers that have been marked"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"df_newpapers = pd.read_csv('data/interim/New Publications from FMRIF Investigators Aug 2020 - 2020_new_papers.csv') #googledoc results\n",
"df_newpapers = df_newpapers[df_newpapers['FMRIF?']=='Y']\n",
"df_newpapers.drop('FMRIF?', inplace=True, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"df_2020_report_papers = pd.concat([df_2019_updated, df_newpapers])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"n_papes = df_2020_report_papers.DOI.shape[0]\n",
"all_cites = df_2020_report_papers['Cited-By Count'].sum()\n",
"citations = df_2020_report_papers['Cited-By Count'].tolist()\n",
"h_index = compute_h_index(citations)\n",
"#Here i bring in the IC information\n",
"df_inv_ic = pd.read_csv('data/raw/investigator_ics.csv')\n",
"df_inv_ic['PI'] = df_inv_ic['Author Name'].str.extract(r'(\\s\\S+$)')\n",
"df_inv_ic['PI'] = df_inv_ic['PI'].str.strip()\n",
"\n",
"df_2020_report_papers['PI'] = df_2020_report_papers['Searched PI']\n",
"df_2020_report_papers.PI.fillna(df_2020_report_papers['PI'], inplace=True)\n",
"df_2020_report_papers.replace('Faith Berman', 'Berman', inplace=True)\n",
"df_2020_report_papers.replace('Mcfarland', 'McFarland', inplace=True)\n",
"df_2020_report_papers = df_2020_report_papers.merge(df_inv_ic[['PI', 'IC']].drop_duplicates(), on='PI', how='left', )\n",
"\n",
"\n",
"gb_s = df_2020_report_papers.groupby('IC').size()\n",
"nimh = gb_s[4]\n",
"ninds = gb_s[5]\n",
"other = gb_s[0] + gb_s[1] + gb_s[2] + gb_s[3] "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Since its inception in 2000 until August 2020, a total of 1222 peer-reviewed publications from intramural investigators have used data acquired in the FMRIF core facility. The total is distributed among 700 papers from NIMH, 345 papers from NINDS, and 148 from the other institutes. These papers have been cited a total of 140641 times for a combined h-index of 185. In other words, 185 papers using the FMRIF have been cited at least 185 times.\n"
]
}
],
"source": [
"print(f\"Since its inception in 2000 until August 2020, a total of {n_papes} peer-reviewed publications from \\\n",
"intramural investigators have used data acquired in the FMRIF core facility. The total is distributed among \\\n",
"{nimh} papers from NIMH, {ninds} papers from NINDS, and {other} from the other institutes. These papers \\\n",
"have been cited a total of {all_cites} times for a combined h-index of {h_index}. In other words, {h_index} \\\n",
"papers using the FMRIF have been cited at least {h_index} times.\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"df_2020_report_papers.to_csv('data/complete/2020_complete.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
18 changes: 15 additions & 3 deletions README.md
@@ -2,11 +2,23 @@

An application to catalog and measure NIH IRP publications.

Assuming that the user is interested in updating these measures in subsequent years, it is a two-stage process.
Assuming that the user is interested in updating these measures in subsequent years, follow these steps:

In the first stage, the `scopus_search.py` script takes as input a list of investigators, and returns all the papers listed in scopus since the specified date.
1. Check the authors in `data/raw/au_ids.txt` to ensure all PIs are represented. [Note 1: we have discovered that some of the IDs in this file returned data from investigators not in the IRP.]

In the second stage, the `update_citation_counts.py` script takes as input the file produced in the previous year, and pulls all the unique EID (paper identifiers), updating the citation counts for those articles. We have experienced EIDs that change a bit year-to-year (on the order of one or two percent). Consequently, this script will print (and write to disk) EIDs that are missed. It is up to the user to go through an manually correct these by, for instance, creating a new csv file with a column labeled `EID` and using this as input for `update_citation_counts.py`.
2. Obtain a listing of the publications previously used in this analysis (henceforth: *old publications*; the complete files are at `/data/complete`).

3. Use the `scopus_search.py` script to obtain all the papers listed in Scopus since a given date (henceforth: *new publications*). This script takes as input a list of investigators and a cutoff date. An example output from this script is located at `/data/interim/output_file.csv`. [Note 2: we have found that this script still returns some papers that precede the cutoff date.]

4. Clean up the new publications with the `01.0-TAR-irp_pubcount.ipynb` notebook (described further below), and distribute them to the investigators so they can mark which of their papers used the scanners. [Here](https://docs.google.com/spreadsheets/d/1w_00-0GV0OHPJxf1FsTi4oVQzbqitz6xmB5-1qMx-vQ/edit#gid=339438628) is a version of this spreadsheet from 2019.

5. Use the `update_citation_counts.py` script to update the number of citations for the *old publications*. This script takes as input the file produced in the previous year and pulls all the unique EIDs (paper identifiers), updating the citation counts for those articles. We have found that a small fraction of EIDs change from year to year (on the order of one or two percent). Consequently, this script will print (and write to disk) the EIDs that are missed. It is up to the user to go through and manually correct these by, for instance, creating a new csv file with a column labeled `EID` and using it as input for a second run of `update_citation_counts.py` (a minimal sketch of such a correction file appears just after this list). The 2020 update of the 2019 papers is found at `data/interim/2019_complete_update_2020.csv`.

6. Download the completed tabulation of which *new publications* used the scanners from the Google Doc. The csv from 2020 is found at `/data/interim/New Publications from FMRIF Investigators Aug 2020 - 2020_new_papers.csv`.

7. The notebook `01.0-TAR-irp_pubcount.ipynb` aggregates the data from the previous steps and generates a paragraph that describes the "productivity" of the research group as a whole. It also writes out a complete listing of papers for use next year. This notebook relies on a PI <-> IC linking file (`/data/raw/investigator_ics.csv`).
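
As a minimal sketch of the manual correction mentioned in step 5: the hand-corrected identifiers can be written to a one-column csv with pandas. The output path and the EID values below are made-up examples for illustration, not paths or values that the scripts create or expect by default.

```python
import pandas as pd

# Placeholder EIDs (illustrative values only) found by hand-searching Scopus
# for the papers that update_citation_counts.py reported as missed.
corrected_eids = ["2-s2.0-85000000001", "2-s2.0-85000000002"]

# As described in step 5, the correction file just needs a column labeled "EID".
pd.DataFrame({"EID": corrected_eids}).to_csv("data/interim/missed_eids_corrected.csv", index=False)
```

The resulting csv can then serve as the input for the second run of `update_citation_counts.py` described in step 5.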

There have been uneven historical attempts to remove editorials, reviews, errata, and the like. The above does not attempt to describe how that might happen.

### scopus_search.py
