forked from nikhgarg/scholar.py
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ieee_map.py
116 lines (95 loc) · 4.19 KB
/
ieee_map.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 11 23:30:19 2014
@author: Nikhil
"""
from selenium_test import GetHTMLSearchIEEEByName
from selenium_test import GetHTMLFromLink
from selenium_test import GetSearchLinkFromArticleName
from Article import Article
from Article import parseIdentificationFromLink
from database import AppendDatabaseFromMap
from database import LoadMapFromDatabase
from References import GetReferencesFromHTML
from References import GetTitleFromRef
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait # available since 2.4.0
from selenium.webdriver.support import expected_conditions as EC # available since 2.26.0
from selenium.webdriver.common.by import By
from HTMLParser import HTMLParser
from bs4 import BeautifulSoup
# Module-level counter backing GetIdentificationNumber(): each call returns the
# current value and then increments it.
global_identification_value = 0
# Shared browser session, started as a side effect of importing this module.
# GetReferenceList passes it to GetHTMLFromLink to fetch reference pages.
# To use Firefox instead of PhantomJS, enable the commented-out line below and
# remove the PhantomJS assignment.
#driver = webdriver.Firefox()
driver = webdriver.PhantomJS()
def GetIdentificationNumber():
    """
    Hand out the next value of the module-level identification counter.

    Returns the value global_identification_value held before this call and
    leaves the counter one higher, so consecutive calls yield consecutive
    numbers.
    """
    global global_identification_value
    issued = global_identification_value
    global_identification_value = issued + 1
    return issued
def GetReferenceList(seedArticle, databaseFile = None, graphFile = None):
    """
    Returns the list of Articles referenced by seedArticle.

    The returned Articles can themselves be passed back to this function, so
    the reference graph can be walked recursively.

    seedArticle  -- Article whose references are wanted. Its link is rewritten
                    in place from the 'articleDetails' form to the
                    'abstractReferences' form before the page is fetched.
    databaseFile -- currently unused; accepted for interface compatibility.
    graphFile    -- currently unused; accepted for interface compatibility.

    Lookup is best-effort: a reference whose title, search result or
    identification cannot be resolved is reported and skipped rather than
    aborting the whole call.
    """
    seedArticle.link = seedArticle.link.replace('articleDetails', 'abstractReferences')
    referencesHtml = GetHTMLFromLink(driver, seedArticle.link)
    references = GetReferencesFromHTML(referencesHtml)
    articleList = []
    if not references:
        # Nothing to look up, so do not start a search browser at all.
        return articleList

    # One Firefox session serves every search in this call. Previously a new
    # browser was launched per reference and never closed, leaking a process
    # for each reference processed.
    try:
        searchDriver = webdriver.Firefox()
    except Exception as e:
        # Stay best-effort (no references found) but make the cause visible.
        print("\tcould not start search browser: " + repr(e))
        return articleList

    try:
        for ref in references:
            try:
                article = Article()
                article.title = GetTitleFromRef(ref)
                searchHtml = GetHTMLSearchIEEEByName(searchDriver, article.title)
                article.link = GetSearchLinkFromArticleName(searchHtml, article.title)
                article.identification = parseIdentificationFromLink(article.link)
                articleList.append(article)
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print(article.identification)
                print(article.title)
                print("\n")
            except Exception as e:
                # 'except Exception' (not a bare except) lets Ctrl-C and
                # SystemExit propagate instead of being silently eaten.
                print("\treference skipped: " + repr(e))
                continue
    finally:
        # Always release the browser, even if the loop is interrupted.
        searchDriver.quit()
    return articleList
def StartFromSeed(seedLink, seedTitle):
    """
    Build the first database/graph entries from a single seed article.

    Creates an Article for (seedLink, seedTitle), fetches its references with
    GetReferenceList, and appends the seed plus every reference, keyed by
    identification, to "database.csv" / "graph.csv" via AppendDatabaseFromMap.
    Prints 'done' when finished.
    """
    databasefile, graphfile = "database.csv", "graph.csv"

    root = Article()
    root.link = seedLink
    root.title = seedTitle
    # The identification is taken from the link before GetReferenceList
    # rewrites that link.
    root.identification = parseIdentificationFromLink(root.link)
    root.references = GetReferenceList(root, databaseFile = databasefile, graphFile = graphfile)

    # Seed first, then its references; a reference sharing the seed's
    # identification replaces the seed entry, exactly as repeated assignment would.
    collected = {root.identification: root}
    collected.update((ref.identification, ref) for ref in root.references)

    AppendDatabaseFromMap(collected, databasefile, graphfile)
    print('done')
def StartFromDatabaseAndGraph(databaseFileOrig, graphFileOrig, databaseFileNew, graphFileNew):
    """
    Extend an existing database/graph by one level of references.

    Loads the articles stored in (databaseFileOrig, graphFileOrig). For every
    article that has no references yet, fetches them with GetReferenceList and
    records any referenced article not already known. The combined map is then
    written to (databaseFileNew, graphFileNew) via AppendDatabaseFromMap.

    Progress is printed per article ('succeeded', 'skipping', 'failed'). A
    failure on one article or one reference is reported and does not stop the
    run.
    """
    # Dictionary of <id: Article>.
    articlesDict = LoadMapFromDatabase(databaseFileOrig, graphFileOrig)
    # Newly discovered articles are collected separately so articlesDict is not
    # resized while it is being iterated.
    articlesDictAdded = {}
    for idNum in articlesDict:
        try:
            article = articlesDict[idNum]
            if len(article.references) != 0:
                print(str(idNum) + ' skipping: already has references')
                continue
            article.references = GetReferenceList(article)
            for art in article.references:
                # Guard each reference individually: one bad entry is reported
                # without discarding the references that follow it, and 'art'
                # is always bound when the handler runs.
                try:
                    if art.identification not in articlesDict:
                        articlesDictAdded[art.identification] = art
                except Exception as e:
                    print("\tarticle failed : " + str(art))
                    print(e)
            print(str(idNum) + ' succeeded: refs' + str(len(article.references)))
        except Exception as e:
            print(str(idNum) + ' failed')
            print(e)
    articlesDict.update(articlesDictAdded)
    AppendDatabaseFromMap(articlesDict, databaseFileNew, graphFileNew)
    print('done')