From 504a5cdf5fc9f7ff2d20b3bec65ee26daf3a7b4a Mon Sep 17 00:00:00 2001
From: emilyasterjones <emily.aster.jones@gmail.com>
Date: Wed, 26 Aug 2020 14:32:43 -0700
Subject: [PATCH] Added institution lookup on Google Scholar

Also changed how authors are iterated over so that institution lookups happen once per author instead of once per citation
---
 citation_overrepresentation_tool.ipynb | 75 ++++++++++++++++++++------
 1 file changed, 59 insertions(+), 16 deletions(-)

diff --git a/citation_overrepresentation_tool.ipynb b/citation_overrepresentation_tool.ipynb
index 1ee5783..0376b89 100644
--- a/citation_overrepresentation_tool.ipynb
+++ b/citation_overrepresentation_tool.ipynb
@@ -10,7 +10,8 @@
     "1. Extract your citations as a .bib file. Either extract all refs from a single folder in your citation manager or extract them straight from a Word doc if you use Mendeley or Zotero using [this tool](https://rintze.zelle.me/ref-extractor/).\n",
     "2. Upload your .bib file to the binder.\n",
     "3. Run each cell (Shift+Enter or Play button).\n",
-    "4. Optional: get an ORCID API key (instructions below) to extract institutions."
+    "4. Optional: extract institutions from Google Scholar.\n",
+    "5. Optional: get an ORCID API key (instructions below) to extract institutions."
    ]
   },
   {
@@ -21,12 +22,14 @@
    "source": [
     "#imports\n",
     "!pip install pybtex\n",
+    "!pip install scholarly\n",
     "\n",
     "from pybtex.database.input import bibtex\n",
     "import glob\n",
     "import pandas as pd\n",
     "import requests\n",
-    "import re"
+    "import re\n",
+    "from scholarly import scholarly"
    ]
   },
   {
@@ -75,16 +78,58 @@
     "#build data frame & print\n",
     "auth_df = pd.DataFrame(authors, columns=['First Name','Last Name', 'Journal'])\n",
     "print('Overcited Authors')\n",
-    "print(auth_df.groupby(['First Name','Last Name']).size().sort_values(ascending=False).head(10))\n",
+    "names_grouped = auth_df.groupby(['First Name','Last Name']).size()\n",
+    "print(names_grouped.sort_values(ascending=False).head(10))\n",
     "print('\\nOvercited Journals')\n",
     "print(auth_df.groupby(['Journal']).size().sort_values(ascending=False).head(10))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Optional: Get institutions from Google Scholar\n",
+    "Using the [Scholarly package](https://pypi.org/project/scholarly/), query Google Scholar for each author's institution. Limitations: assumes the first search hit is the correct author; if Google Scholar rejects the queries for making too many requests, you may need to use the method get_proxy."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#match each author to an institution ('Undetermined' when the lookup fails or has no hit)\n",
+    "#NB this section calls Google Scholar once for each author, so it takes a while\n",
+    "inst_list = list()\n",
+    "for name in names_grouped.axes[0]:\n",
+    "    try:\n",
+    "        search_query = scholarly.search_author(name[0]+' '+name[1])\n",
+    "        author = next(search_query).fill()\n",
+    "        affil = re.sub('Professor .+, ','',author.affiliation)\n",
+    "        inst_list.append(affil)\n",
+    "    except Exception: #not a bare except, so interrupting the kernel still stops this slow loop\n",
+    "        inst_list.append('Undetermined')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#index author counts by institution, sum authors sharing an institution, and print\n",
+    "inst_series = pd.Series(names_grouped.values, index = inst_list).groupby(level=0).sum()\n",
+    "print('Overcited Institutions')\n",
+    "print(inst_series.sort_values(ascending=False).head(10))"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### Optional: Get institutions from ORCID\n",
+    "Using the ORCID API, query ORCID for institutions. Limitations: assumes the first hit is correct, and many authors have empty ORCIDs.\n",
+    "\n",
     "Get an [ORCID API client ID & secret](https://support.orcid.org/hc/en-us/articles/360006897174)\n",
     "\n",
     "You can learn more about how to [search for an ORCID](https://members.orcid.org/api/tutorial/search-orcid-registry) and [find info about an author given their ORCID](https://members.orcid.org/api/tutorial/read-orcid-records) in the API documentation."
@@ -134,9 +179,9 @@
     "#find ORCIDs\n",
     "#NB this section calls the API once for each author, so it takes a while\n",
     "orcid_list = list()\n",
-    "for index, row in auth_df.iterrows():\n",
-    "    given = re.sub(' ','%20',row['First Name'])\n",
-    "    family = re.sub(' ','%20',row['Last Name'])\n",
+    "for name in names_grouped.axes[0]:\n",
+    "    given = re.sub(' ','%20',name[0])\n",
+    "    family = re.sub(' ','%20',name[1])\n",
     "\n",
     "    #build search\n",
     "    url = \"https://pub.orcid.org/v3.0/search/?q=\" \\\n",
@@ -148,9 +193,7 @@
     "    if auth_id.json()['result'] is not None:\n",
     "        orcid_list.append(auth_id.json()['result'][0]['orcid-identifier']['path'])\n",
     "    else:\n",
-    "        orcid_list.append('')\n",
-    "\n",
-    "auth_df['ORCID'] = orcid_list"
+    "        orcid_list.append('')"
    ]
   },
   {
@@ -162,9 +205,9 @@
     "# get institution from ORCID\n",
     "#NB this section calls the API once for each author, so it takes a while\n",
     "inst_list = list()\n",
-    "for index, row in auth_df.iterrows():\n",
-    "    if len(row['ORCID'])>0:\n",
-    "        url = \"https://pub.orcid.org/v2.1/\" + row['ORCID'] + \"/record\"\n",
+    "for orcid in orcid_list:\n",
+    "    if len(orcid)>0:\n",
+    "        url = \"https://pub.orcid.org/v2.1/\" + orcid + \"/record\"\n",
     "        orcid_request = requests.get(url, headers=headers, timeout=None)\n",
     "        affil = orcid_request.json()['activities-summary']['employments']['employment-summary']\n",
     "        if len(affil)>0:\n",
@@ -182,10 +225,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#add to df & print\n",
-    "auth_df['Institution'] = inst_list\n",
-    "print('\\nOvercited Institutions')\n",
-    "print(auth_df.groupby(['Institution']).size().sort_values(ascending=False).head(10))"
+    "#index author counts by institution, sum authors sharing an institution, and print\n",
+    "inst_series = pd.Series(names_grouped.values, index = inst_list).groupby(level=0).sum()\n",
+    "print('Overcited Institutions')\n",
+    "print(inst_series.sort_values(ascending=False).head(10))"
    ]
   }
  ],