Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update files related to parsing #4

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
100 changes: 100 additions & 0 deletions .ipynb_checkpoints/Untitled-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import functions\n",
"import glob"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"for pdf in glob.glob(\"*.pdf\"):\n",
" try:\n",
" functions.convert_pdf_to_txt(pdf)\n",
" except:\n",
" read(pdf,'rb')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"functions.convert_pdf_to_txt('N3865.pdf')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "TypeError",
"evalue": "ufunc 'add' did not contain a loop with signature matching types dtype('S93') dtype('S93') dtype('S93')",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-6-974a080e14c7>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mids\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mfunctions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconvert_pdf_to_txt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/Users/rhammond/MineForSpecimens/functions.py\u001b[0m in \u001b[0;36mconvert_pdf_to_txt\u001b[0;34m(itemId)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mconverts\u001b[0m \u001b[0mit\u001b[0m \u001b[0mto\u001b[0m \u001b[0ma\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mtxt\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 114\u001b[0m '''\n\u001b[0;32m--> 115\u001b[0;31m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msystem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'wget --no-check-certificate https://digitallibrary.amnh.org/rest/bitstreams/'\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitemId\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m'/retrieve -O '\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mitemId\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'.pdf'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 116\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0mrsrcmgr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPDFResourceManager\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: ufunc 'add' did not contain a loop with signature matching types dtype('S93') dtype('S93') dtype('S93')"
]
}
],
"source": [
"year = 2016\n",
"ids = functions.getIdsFromPeriod(year)\n",
"\n",
"for i in ids:\n",
" functions.convert_pdf_to_txt(i)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
226 changes: 226 additions & 0 deletions .ipynb_checkpoints/hacks-checkpoint.ipynb

Large diffs are not rendered by default.

1,508 changes: 1,508 additions & 0 deletions .ipynb_checkpoints/pdf_parse-checkpoint.ipynb

Large diffs are not rendered by default.

852 changes: 852 additions & 0 deletions .ipynb_checkpoints/testing-checkpoint.ipynb

Large diffs are not rendered by default.

Binary file modified N3849.pdf
Binary file not shown.
Binary file added N3865.pdf
Binary file not shown.
161 changes: 161 additions & 0 deletions Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import functions\n",
"import glob\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"for pdf in glob.glob(\"*.pdf\"):\n",
" try:\n",
" functions.convert_pdf_to_txt(pdf)\n",
" except:\n",
" read(pdf,'rb')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"functions.convert_pdf_to_txt('N3865.pdf')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"year = 2015\n",
"ids = functions.getIdsFromPeriod(year)\n",
"\n",
"for i in ids:\n",
" try:\n",
" functions.convert_pdf_to_txt(i)\n",
" except:\n",
" os.system('wget --no-check-certificate https://digitallibrary.amnh.org/rest/bitstreams/'+str(i))\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"44 6706\n"
]
},
{
"data": {
"text/plain": [
"{'authors': [u'Soto-Centeno, J. Angel.',\n",
" u\"O'Brien, Margaret (Margaret Elizabeth)\",\n",
" u'Simmons, Nancy B.'],\n",
" u'dc.date.available': u'2015-12-29T16:13:58Z',\n",
" u'dc.date.issued': u'2015-12-28',\n",
" u'dc.description': u'32 pages : color illustrations, maps (some color) ; 26 cm.',\n",
" u'dc.description.abstract': u'The bat family Mormoopidae includes three species with distributions in the Caribbean. These taxa--Mormoops blainvillei, Pteronotus parnellii, and P. quadridens--roost predominantly in hot cave chambers where temperatures may reach 40\\xb0 C and humidity is close to 100%. We tested the hypothesis that mormoopid bat extirpations in this region were due to climatic changes and the loss of suitable cave environments due to flooding caused by sea level rise associated with the late Pleistocene to Holocene (ca. 10 ka) climate change transition. Ecological niche models (ENMs) were developed to estimate the current, mid-Holocene, and Last Glacial Maximum distributions of these three bat species and to assess whether suitable climatic habitat for these taxa had been stable across time in the Caribbean. Additionally, we examined the importance of karst formations (where hot caves typically form) as a predictor for the distributions of Caribbean mormoopid bats. Our results show that mormoopid bat distributions in the Caribbean have remained relatively stable over time with climate ENMs indicating up to a 19% expansion in the amount of suitable habitat from late Pleistocene to the present. Presence of karst was a good predictor when used alone or when combined as karst-climate ENMs. Fossil evidence shows that some populations of mormoopids became extirpated as recently as 3.6 ka. These data, taken together with our conclusion that suitable climate habitat for mormoopid bats existed in the Caribbean beyond late Pleistocene to Holocene transition, suggest that these bats may have survived this climate change event by roosting outside their characteristic hot cave environment.',\n",
" u'dc.identifier.uri': u'http://hdl.handle.net/2246/6631',\n",
" u'dc.language.iso': u'en_US',\n",
" u'dc.publisher': u'American Museum of Natural History.',\n",
" u'dc.relation.ispartofseries': u'American Museum novitates;no.3847.',\n",
" u'dc.title': u'The importance of late Quaternary climate change and karst on distributions of Caribbean mormoopid bats. (American Museum novitates, no. 3847)',\n",
" 'subjects': [u'Antillean ghost-faced bat.',\n",
" u'Pteronotus parnellii.',\n",
" u'Pteronotus quadridens.',\n",
" u'Mormoopidae.',\n",
" u'Bats.',\n",
" u'Bats, Fossil.',\n",
" u'Climatic changes.',\n",
" u'Karst.',\n",
" u'Cave animals.',\n",
" u'Paleoclimatology.',\n",
" u'Paleobiogeography.',\n",
" u'Caribbean Area.'],\n",
" 'title.alternatives': [u'Climate change and Caribbean bats.']}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"year = 2015\n",
"ids = functions.getIdsFromPeriod(year)\n",
"print len(ids), ids[0]\n",
"functions.getMetadata(ids[0])\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"try:\n",
" functions.convert_pdf_to_txt(6706)\n",
"except:\n",
" os.system('wget --no-check-certificate https://digitallibrary.amnh.org/rest/bitstreams/'+str(6706)+'/retrieve -O ' + str(6706) + '.txt')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
73 changes: 73 additions & 0 deletions functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,18 @@
import json
import csv
import pandas as pd
import glob
import path
import errno
import signal


# pdfminer is a third-party dependency and must be installed separately (pip is recommended)
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO

try:
import urllib2 as urllib
Expand Down Expand Up @@ -92,3 +104,64 @@ def getMetadata(itemId):
#'species':findSpecieInPDF(file.pdf)

return dic

def signal_handler(signum, frame):
    '''Signal callback that aborts the interrupted code with "Timed Out".

    convert_pdf_to_txt registers this for SIGALRM to bound the time spent
    on one page. Neither argument is inspected; they are only present
    because the signal module calls handlers as handler(signum, frame).
    '''
    timeout_error = Exception("Timed Out")
    raise timeout_error

def convert_pdf_to_txt(itemId):
    '''Download the PDF for a bitstream id and return its extracted text.

    The PDF is fetched with wget from the AMNH digital library REST
    endpoint into "<itemId>.pdf" in the current working directory, fed to
    pdfminer one page at a time, and deleted again before returning.

    Parameters:
        itemId: bitstream identifier. It is passed through str() to build
            both the download URL and the local file name.

    Returns:
        The text collected from every page that could be processed.
        A page that raises, or takes longer than 20 seconds, is skipped
        (best effort) and reported as a logging warning.

    Raises:
        Whatever opening the file or pdfminer's page iteration raises,
        e.g. when the download failed or the file is not a readable PDF.
        The downloaded file is removed in that case too.

    Note: the per-page timeout is built on signal.SIGALRM, which is only
    available on Unix and only usable from the main thread.
    '''
    import logging  # local import: nothing else in this block needs it

    pdf_path = str(itemId) + '.pdf'
    os.system('wget --no-check-certificate https://digitallibrary.amnh.org/rest/bitstreams/'+str(itemId)+'/retrieve -O ' + pdf_path)

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Install the timeout handler once (not once per page) and remember the
    # previous one so it can be put back when we are done.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    try:
        with open(pdf_path, 'rb') as fp:
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page_number, page in enumerate(pages, 1):
                signal.alarm(20)
                try:
                    interpreter.process_page(page)
                except Exception as msg:
                    # Best effort: keep the text of the other pages.
                    logging.getLogger(__name__).warning(
                        "skipping page %d of %s: %s",
                        page_number, pdf_path, msg)
                finally:
                    # Cancel the pending alarm; otherwise it can fire while
                    # the next page is being fetched or after we return.
                    signal.alarm(0)

        # Must be read before the buffer is closed below.
        text = retstr.getvalue()
    finally:
        # signal.signal() returns None when the previous handler was not
        # installed from Python; that value cannot be re-installed.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
        device.close()
        retstr.close()
        if os.path.exists(pdf_path):
            os.remove(pdf_path)

    return text

def findSpecieInPDF(itemId):
    '''Download the PDF for a bitstream id; species lookup is still a stub.

    Only the download step is implemented. Text extraction, removal of the
    local PDF and the species search are not wired in yet, so the
    function always returns an empty list and leaves "<itemId>.pdf" in
    the current working directory.

    Parameters:
        itemId: bitstream identifier; converted with str() wherever it is
            concatenated into the wget command.

    Returns:
        An empty list (placeholder for the species found in the PDF).
    '''

    #download PDF
    # str() on both uses of itemId: a non-string id otherwise makes the
    # concatenation raise TypeError before wget is even started.
    os.system('wget --no-check-certificate https://digitallibrary.amnh.org/rest/bitstreams/'+str(itemId)+'/retrieve -O ' + str(itemId) + '.pdf')

    # TODO: extract the text (convert_pdf_to_txt), remove the local pdf,
    # and apply Ben's species-matching function to fill this list.
    listOfSpecies = []

    return listOfSpecies

Loading