From 504a5cdf5fc9f7ff2d20b3bec65ee26daf3a7b4a Mon Sep 17 00:00:00 2001 From: emilyasterjones Date: Wed, 26 Aug 2020 14:32:43 -0700 Subject: [PATCH] Added institution lookup on Google Scholar Also changed how authors are iterated over so that institution lookups happen once per author instead of once per citation --- citation_overrepresentation_tool.ipynb | 75 ++++++++++++++++++++------ 1 file changed, 59 insertions(+), 16 deletions(-) diff --git a/citation_overrepresentation_tool.ipynb b/citation_overrepresentation_tool.ipynb index 1ee5783..0376b89 100644 --- a/citation_overrepresentation_tool.ipynb +++ b/citation_overrepresentation_tool.ipynb @@ -10,7 +10,8 @@ "1. Extract your citations as a .bib file. Either extract all refs from a single folder in your citation manager or extract them straight from a Word doc if you use Mendeley or Zotero using [this tool](https://rintze.zelle.me/ref-extractor/).\n", "2. Upload your .bib file to the binder.\n", "3. Run each cell (Shift+Enter or Play button).\n", - "4. Optional: get an ORCID API key (instructions below) to extract institutions." + "4. Optional: extract institutions from Google Scholar (one query per author, so it can be slow).\n", + "5. Optional: get an ORCID API key (instructions below) to extract institutions."
] }, { @@ -21,12 +22,14 @@ "source": [ "#imports\n", "!pip install pybtex\n", + "!pip install scholarly\n", "\n", "from pybtex.database.input import bibtex\n", "import glob\n", "import pandas as pd\n", "import requests\n", - "import re" + "import re\n", + "from scholarly import scholarly" ] }, { @@ -75,16 +78,58 @@ "#build data frame & print\n", "auth_df = pd.DataFrame(authors, columns=['First Name','Last Name', 'Journal'])\n", "print('Overcited Authors')\n", - "print(auth_df.groupby(['First Name','Last Name']).size().sort_values(ascending=False).head(10))\n", + "names_grouped = auth_df.groupby(['First Name','Last Name']).size()\n", + "print(names_grouped.sort_values(ascending=False).head(10))\n", "print('\\nOvercited Journals')\n", "print(auth_df.groupby(['Journal']).size().sort_values(ascending=False).head(10))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Optional: Get institutions from Google Scholar\n", + "Using the [Scholarly package](https://pypi.org/project/scholarly/), query Google Scholar for institutions. Limitations: this assumes the first search hit is the correct author, and if Google Scholar blocks you for sending too many requests you may need to use the method get_proxy."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#match each author to an institution\n", + "#NB this section calls Google Scholar once for each author, so it takes a while\n", + "inst_list = list()\n", + "for name in names_grouped.axes[0]:\n", + " search_query = scholarly.search_author(name[0]+' '+name[1])\n", + " try:\n", + " author = next(search_query).fill()\n", + " affil = re.sub('Professor .+, ','',author.affiliation)\n", + " inst_list.append(affil)\n", + " except Exception:\n", + " inst_list.append('Undetermined')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#sum author citation counts per institution and print\n", + "inst_series = pd.Series(names_grouped.values, index = inst_list).groupby(level=0).sum()\n", + "print('Overcited Institutions')\n", + "print(inst_series.sort_values(ascending=False).head(10))" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Optional: Get institutions from ORCID\n", + "Using the ORCID API, query ORCID for institutions. Limitations: assumes first hit is correct, many authors have empty ORCIDs.\n", + "\n", "Get an [ORCID API client ID & secret](https://support.orcid.org/hc/en-us/articles/360006897174)\n", "\n", "You can learn more about how to [search for an ORCID](https://members.orcid.org/api/tutorial/search-orcid-registry) and [find info about an author given their ORCID](https://members.orcid.org/api/tutorial/read-orcid-records) in the API documentation."
@@ -134,9 +179,9 @@ "#find ORCIDs\n", "#NB this section calls the API once for each author, so it takes a while\n", "orcid_list = list()\n", - "for index, row in auth_df.iterrows():\n", - " given = re.sub(' ','%20',row['First Name'])\n", - " family = re.sub(' ','%20',row['Last Name'])\n", + "for name in names_grouped.axes[0]:\n", + " given = re.sub(' ','%20',name[0])\n", + " family = re.sub(' ','%20',name[1])\n", "\n", " #build search\n", " url = \"https://pub.orcid.org/v3.0/search/?q=\" \\\n", @@ -148,9 +193,7 @@ " if auth_id.json()['result'] is not None:\n", " orcid_list.append(auth_id.json()['result'][0]['orcid-identifier']['path'])\n", " else:\n", - " orcid_list.append('')\n", - "\n", - "auth_df['ORCID'] = orcid_list" + " orcid_list.append('')" ] }, { @@ -162,9 +205,9 @@ "# get institution from ORCID\n", "#NB this section calls the API once for each author, so it takes a while\n", "inst_list = list()\n", - "for index, row in auth_df.iterrows():\n", - " if len(row['ORCID'])>0:\n", - " url = \"https://pub.orcid.org/v2.1/\" + row['ORCID'] + \"/record\"\n", + "for orcid in orcid_list:\n", + " if len(orcid)>0:\n", + " url = \"https://pub.orcid.org/v2.1/\" + orcid + \"/record\"\n", " orcid_request = requests.get(url, headers=headers, timeout=None)\n", " affil = orcid_request.json()['activities-summary']['employments']['employment-summary']\n", " if len(affil)>0:\n", @@ -182,10 +225,10 @@ "metadata": {}, "outputs": [], "source": [ - "#add to df & print\n", - "auth_df['Institution'] = inst_list\n", - "print('\\nOvercited Institutions')\n", - "print(auth_df.groupby(['Institution']).size().sort_values(ascending=False).head(10))" + "#sum author citation counts per institution and print\n", + "inst_series = pd.Series(names_grouped.values, index = inst_list).groupby(level=0).sum()\n", + "print('Overcited Institutions')\n", + "print(inst_series.sort_values(ascending=False).head(10))" ] } ],