# AnalyzeNLP.py
import cortipy
import os
import mysql.connector as msc
import time
import sys
from textblob import TextBlob
import json
# Credentials are read from the environment; either value is None when the
# corresponding variable is not set.
apiKey = os.getenv('CORTICAL_API_KEY')
MYSQL_GSA_PASSWORD = os.getenv('MYSQL_GSA_PASSWORD')

# Cortical.io client used for every keyword / fingerprint call below.
client = cortipy.CorticalClient(apiKey)

# Keyword arguments handed to mysql.connector.connect().
config = dict(
    user='root',
    password=MYSQL_GSA_PASSWORD,
    host='130.211.154.93',
    database='test',
    charset='utf8',
)
# Usage: pass the table to analyse as the first command-line argument, e.g.
# "vctest4" (VC scraper) or "crunchbase_startups" (startup capital scraper).
# When no argument is given the startup table is used; previously this default
# was overwritten unconditionally and a missing argument raised IndexError.
dbtable = sys.argv[1] if len(sys.argv) > 1 else "crunchbase_startups"

# The table name is spliced into SQL text (identifiers cannot be bound as
# query parameters), so accept only letters, digits and underscores.
if not dbtable.replace("_", "").isalnum():
    sys.exit("invalid table name: " + repr(dbtable))

#Need to put this into an infinite loop
# while 1:
con = msc.connect(**config)
cur = con.cursor()

# Load the stored metadata and analysis columns for every row that already
# has a Cortical.io fingerprint and a non-empty page text.
cur.execute(
    "select siteurl, market, funding_total_usd, status, country_code, "
    "state_code, funding_rounds, cortical_io, watson, opencalais, "
    "cortical_io_keywords from " + dbtable +
    " where cortical_io is not null and text <> ''"
)
full = cur.fetchall()
# Top categories by funding raised in USD (the tuple holds 11 entries).
categories = ("Biotechnology", "Communities", "Clean Technology", "Curated Web", "Consumer Electronics", "Advertising", "Analytics", "Batteries", "Clinical Trials", "Big Data", "Banking")

# The samples are drawn for one category. It is kept under its own name so
# that no loop variable can overwrite it between the two queries (the old code
# reused `i` both as the category index and as a loop counter).
target_category = categories[0]

# Filter shared by both sample queries: operating US companies with at least
# one funding round, one random row.
sample_filter = " and status='operating' and country_code = 'USA' and funding_rounds>0 ORDER BY RAND() limit 1;"

# build positive examples: page text of a company in the target category.
# The category is bound as a query parameter instead of being concatenated.
cur = con.cursor()
cur.execute(
    "select siteurl, text from " + dbtable +
    " where cortical_io is not null and text <> '' and market = %s" + sample_filter,
    (target_category,),
)
pos = cur.fetchall()
postext = [row[1] for row in pos]

# build negative examples: page text of a company in any other category.
cur = con.cursor()
cur.execute(
    "select siteurl, text from " + dbtable +
    " where cortical_io is not null and text <> '' and market <> %s" + sample_filter,
    (target_category,),
)
neg = cur.fetchall()
negtext = [row[1] for row in neg]
# The old code had a bare `pos[0][0]` expression after the comparisons; its
# only effect was an IndexError when no positive sample was found. Check for
# that case up front, before any API call, and stop with a clear message.
if not postext:
    con.close()
    sys.exit("no positive examples found in " + dbtable)

#build classifier based on pos text
# NOTE(review): the negative samples in `negtext` are not passed here (an empty
# string is sent instead) - confirm whether that is intended.
CBCategoryClassifier = client.createClassification("test", postext, "")

# Check term similarity: compare the fingerprint of each category name with
# the classifier and print the Euclidean distance reported by the API.
# print(...) with a single argument behaves the same on Python 2 and 3.
for category in categories:
    unseenTermBitmap = client.getTextBitmap(category)['fingerprint']['positions']
    distances = client.compare(unseenTermBitmap, CBCategoryClassifier['positions'])
    print(category + " " + str(distances['euclideanDistance']))
# Distance for the last category, printed once more on its own line.
print(distances['euclideanDistance'])

#check new copy: distance of an unrelated block of text from the classifier.
unseenBitmap = client.getTextBitmap("The Zen of Python >>>import this")['fingerprint']['positions']
distances = client.compare(unseenBitmap, CBCategoryClassifier['positions'])
print(distances['euclideanDistance'])
# Analyse the text of every stored page and write the results back:
# Cortical.io keywords and fingerprint, TextBlob sentiment and language.
#
# The previous loop assigned into undefined names (siteurl[i], market[i], ...)
# and used a `text` variable that was never selected, so it failed with a
# NameError on the first row. Only siteurl and text are needed here, so they
# are queried directly with the same row filter as the earlier `full` query.

# The UPDATE statements are identical for every row; build them once.
# NOTE(review): sentiment is stored in the `opencalais` column and the
# detected language in the `watson` column - confirm this reuse is intended.
MySqlKeyWordDatQ = """UPDATE """+dbtable+""" SET cortical_io_keywords = %s WHERE siteurl = %s"""
MySqlBitMapDatQ = """UPDATE """+dbtable+""" SET cortical_io = %s WHERE siteurl = %s"""
MySqBlobDatQ = """UPDATE """+dbtable+""" SET opencalais = %s WHERE siteurl = %s"""
MySqLangDatQ = """UPDATE """+dbtable+""" SET watson = %s WHERE siteurl = %s"""

try:
    cur = con.cursor()
    cur.execute("select siteurl, text from "+dbtable+" where cortical_io is not null and text <> ''")
    for siteurl, text in cur.fetchall():
        #Cortical.io
        termKeyWords = client.extractKeywords(text)
        termBitmap = client.getTextBitmap(text)['fingerprint']['positions']

        #TextBlob
        # NOTE(review): detect_language() may be unavailable in newer TextBlob
        # releases - confirm against the installed version.
        blob = TextBlob(text)

        # (value, key) pairs matching the two %s placeholders of each UPDATE.
        MySqlKeyWordDat = (','.join(termKeyWords), siteurl)
        MySqlBitMapDat = (str(termBitmap), siteurl)
        MySqlTextBlobDat = (str(blob.sentiment), siteurl)
        MySqLangDat = (str(blob.detect_language()), siteurl)
        print("---For "+siteurl+" keywords = " + ",".join(termKeyWords) + " sentiment = " + MySqlTextBlobDat[0] + " lang:" + MySqLangDat[0])

        #upload keywords and bitmap to database
        cur.execute(MySqlKeyWordDatQ, MySqlKeyWordDat)
        cur.execute(MySqlBitMapDatQ, MySqlBitMapDat)
        cur.execute(MySqBlobDatQ, MySqlTextBlobDat)
        cur.execute(MySqLangDatQ, MySqLangDat)
        # Commit per row so finished rows stay saved if a later row fails.
        con.commit()
finally:
    # Close the connection even when an API call or a query raises.
    con.close()
#bitmapTerms = client.bitmapToTerms(termBitmap['fingerprint'])
###### sample code below.
# Evaluate how close a new term is to the category.
#termBitmap = client.getBitmap("Python")['fingerprint']['positions']
#distances = client.compare(termBitmap, programmingCategory['positions'])
#print distances['euclideanDistance']
# Try a block of text.
#textBitmap = client.getTextBitmap("The Zen of Python >>>import this")['fingerprint']['positions']
#distances = client.compare(textBitmap, programmingCategory['positions'])
#print distances['euclideanDistance']