-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathphenotype.py
294 lines (253 loc) · 12.5 KB
/
phenotype.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
import spacy
import re
from nltk.tree import Tree
from nltk.chunk.util import tagstr2tree
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, regexp_tokenize
import nltk
import pandas as pd
from spacy.matcher import PhraseMatcher
# NLTK data packages fetched at import time: the perceptron POS tagger,
# WordNet with its multilingual extension, and the NE chunker with its
# word list.
# NOTE(review): word_tokenize (imported above) needs NLTK's "punkt" tokenizer
# data, which is not fetched here -- confirm it is installed some other way.
for _package in (
    'averaged_perceptron_tagger',
    'wordnet',
    'omw-1.4',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(_package)

# spaCy pipeline shared by the spaCy-based extractors in this module.
nlp = spacy.load('en_core_web_sm')
# Coded labels for every citrus characteristic, keyed by characteristic name.
# Each label has the form "<prefix>_<value>"; the NLTK-based extractors compare
# tokens against the <value> part and report the full coded label.
# NOTE(review): "fruit flesh" and "fruit flavor" both use the "ff" prefix.
characteristics = {
    "tree health": ["th_good", "th_poor", "th_very good"],
    "fruit yield": ["fy_heavy", "fy_moderate", "fy_light"],
    "fruit size": ["fs_large", "fs_medium", "fs_small"],
    "fruit shape": [
        "fsh_oblate",
        "fsh_elongated",
        "fsh_necked",
        "fsh_spheroid",
    ],
    "fruit color": ["fc_green", "fc_color break", "fc_orange"],
    "harvest": ["hv_plugged", "hv_clean"],
    "peel": ["p_easy to peel", "p_not peelable"],
    "peel thickness": [
        "pt_thick peel",
        "pt_thick flavedo",
        "pt_thick albedo",
    ],
    "fruit flesh": ["ff_juicy", "ff_dry", "ff_segments separate easily"],
    "seed number": ["sn_high", "sn_moderate", "sn_low", "sn_none"],
    "brix": ["b_high", "b_moderate", "b_low"],
    "acid": ["a_high", "a_moderate", "a_low"],
    "fruit flavor": [
        "ff_best",
        "ff_above average",
        "ff_average",
        "ff_below average",
    ],
    "trifoliate aftertaste": [
        "ta_trifoliate aftertaste",
        "ta_present",
        "ta_not present",
    ],
    "market class": [
        "mc_grapefruit like",
        "mc_orange like",
        "mc_mandarin like",
    ],
    "recheck": ["rc_recheck", "rc_yes", "rc_no"],
}
# Plain-language keywords for each citrus feature (no coded prefixes).
# "trifoliate aftertaste" also lists the spelling "trifolate aftertaste" --
# presumably to catch a common transcription misspelling; confirm before
# removing it.
features = {
    "health": ["good", "poor", "very good"],
    "yield": ["heavy", "moderate", "light"],
    "size": ["large", "medium", "small"],
    "shape": ["oblate", "elongated", "necked", "spheroid"],
    "color": ["green", "color break", "orange"],
    "harvest": ["plugged", "clean"],
    "peel": ["easy to peel", "not peelable"],
    "peel thickness": ["thick peel", "thick flavedo", "thick albedo"],
    "fruit flesh": ["juicy", "dry", "segments separate easily"],
    "seed number": ["high", "moderate", "low", "none"],
    "brix": ["high", "moderate", "low"],
    "acid": ["high", "moderate", "low"],
    "fruit flavor": ["best", "above average", "average", "below average"],
    "trifoliate aftertaste": [
        "trifoliate aftertaste",
        "trifolate aftertaste",
        "present",
        "not present",
    ],
    "market class": ["grapefruit like", "orange like", "mandarin like"],
    "recheck": ["recheck", "yes", "no"],
}
# Keyword phrases for the okra observations, keyed by feature name.
features_okra = {
    "stem": ["red green"],
    "flowering": ["not flowering", "one pod"],
}
def extract_features_matcher(text):
    """Extract characteristics from *text* with a spaCy ``PhraseMatcher``.

    A matcher keyed on the lower-cased token text is built on every call,
    with one rule per entry of ``characteristics`` whose phrases are that
    entry's labels.

    Returns a pandas Series mapping characteristic name to the text of the
    matched span. If a characteristic matches more than once, the last match
    yielded by the matcher is the one kept.
    """
    phrase_matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    for name, labels in characteristics.items():
        phrase_matcher.add(
            name, [nlp.make_doc(label.lower()) for label in labels])

    parsed = nlp(text)
    # Later matches for the same rule overwrite earlier ones.
    found = {
        nlp.vocab.strings[rule_id]: parsed[first:last].text
        for rule_id, first, last in phrase_matcher(parsed)
    }
    return pd.Series(found)
def extract_features(text):
    """Extract okra features from *text* by keyword search.

    For every feature in ``features_okra`` each keyword is searched for as a
    whole word or phrase, ignoring case. All hits for a feature, from all of
    its keywords, are collected in one set and spelled as they appear in
    *text*. When no keyword hits, the function falls back to spaCy: the first
    keyword (in declared order) that equals a lower-cased PRODUCT entity, or
    failing that a lower-cased adjective token, is stored as a plain string.

    Returns a dict keyed by feature name; features without any evidence are
    left out.
    """
    doc = nlp(text)
    named_entities = {
        ent.text.lower() for ent in doc.ents if ent.label_ == "PRODUCT"}
    adjectives = {
        token.text.lower() for token in doc if token.pos_ == "ADJ"}

    extracted_features = {}
    for feature, keywords in features_okra.items():
        # Accumulate across keywords so one keyword's hits do not replace
        # those of a keyword checked earlier for the same feature.
        hits = set()
        for keyword in keywords:
            # Escape so regex metacharacters in a keyword match literally.
            pattern = r'\b{}\b'.format(re.escape(keyword))
            hits.update(re.findall(pattern, text, re.IGNORECASE))
        if hits:
            extracted_features[feature] = hits
            continue

        # Fallback: entity match takes precedence over adjective match.
        fallback = next((kw for kw in keywords if kw in named_entities), None)
        if fallback is None:
            fallback = next((kw for kw in keywords if kw in adjectives), None)
        if fallback is not None:
            extracted_features[feature] = fallback
    # return pd.Series(extracted_features)
    return extracted_features
def context_phenotype_text(text, context_size=0):
    """Extract coded characteristics from *text* using a token context window.

    *text* is split with ``word_tokenize``. For each characteristic in
    ``characteristics`` and each token:

    * a label whose value part (the text after the prefix) equals the
      lower-cased token is recorded, together with any lower-cased token in
      the surrounding window that is itself a label;
    * otherwise the words making up the token's WordNet lemma names are
      intersected with the labels, and window tokens found in that
      intersection are recorded.

    The window spans ``context_size`` tokens on each side of the current
    token, clipped to the text.

    Returns a pandas Series mapping characteristic name to a list of unique
    entries (order unspecified), or ``None`` when nothing was found.
    """
    tokens = word_tokenize(text)
    if not tokens:
        return None
    lowered = [token.lower() for token in tokens]

    # WordNet lemma words per distinct token; the lookup depends only on the
    # token, so it is done once instead of once per label per characteristic.
    lemma_words = {}

    fruit_characteristics = {}
    for characteristic, synonyms in characteristics.items():
        values = [syn.split("_")[1] for syn in synonyms]
        found = set()
        for i, word in enumerate(lowered):
            start_index = max(i - context_size, 0)
            end_index = min(i + context_size + 1, len(tokens))
            window = lowered[start_index:end_index]

            exact = [syn for syn, value in zip(synonyms, values)
                     if value == word]
            if exact:
                found.update(exact)
                found.update(w for w in window if w in synonyms)
            if len(exact) == len(synonyms):
                # Every label matched exactly; the WordNet path is only
                # taken for labels that did not.
                continue

            raw = tokens[i]
            if raw not in lemma_words:
                pieces = set()
                for synset in wordnet.synsets(raw):
                    for lemma in synset.lemmas():
                        pieces.update(
                            piece.lower() for piece in lemma.name().split('_'))
                lemma_words[raw] = pieces
            # NOTE(review): every label visible in `characteristics` contains
            # an underscore while these pieces never do, so this intersection
            # looks like it is always empty -- confirm the intended comparison.
            related = {w for w in lemma_words[raw] if w in synonyms}
            if related:
                found.update(w for w in window if w in related)
        if found:
            fruit_characteristics[characteristic] = list(found)

    if not fruit_characteristics:
        return None
    return pd.Series(fruit_characteristics)
def reg_phenotype_text(text):
    """Extract coded characteristics from *text* using regex tokenisation.

    *text* is tokenised with a pattern that prefers two-word chunks and falls
    back to single words. The POS-tagged tokens are dumped to
    ``pos_tag_regex.txt`` in the current directory (overwritten on each call).

    For each characteristic in ``characteristics`` and each token, a label is
    recorded when its value part (the text after the prefix) equals the
    lower-cased token. In addition, if the token has WordNet synsets, every
    label ending with the first lemma name of the first synset is recorded.

    Returns a pandas Series mapping characteristic name to a list of unique
    labels (order unspecified), or ``None`` when nothing was found.
    """
    reg_tokens = regexp_tokenize(text, pattern=r"(\w+\s\w+|\w+)")
    pos_tags = nltk.pos_tag(reg_tokens)
    # Debug dump; explicit encoding so non-ASCII text cannot fail on a
    # platform whose default encoding is not UTF-8.
    with open("pos_tag_regex.txt", "w", encoding="utf-8") as f:
        f.write(str(pos_tags))

    # First lemma name of the first WordNet synset per distinct token (None
    # when the token has no synsets), looked up once and reused.
    head_lemma = {}

    fruit_characteristics = {}
    for characteristic, synonyms in characteristics.items():
        found = set()
        for word, _tag in pos_tags:
            lowered = word.lower()
            exact = [syn for syn in synonyms if syn.split("_")[1] == lowered]
            found.update(exact)
            if len(exact) == len(synonyms):
                # The WordNet path is only taken for labels that did not
                # match exactly.
                continue
            if word not in head_lemma:
                senses = wordnet.synsets(word)
                head_lemma[word] = (
                    senses[0].lemmas()[0].name() if senses else None)
            head = head_lemma[word]
            if head is not None:
                found.update(s for s in synonyms if s.endswith(head))
        if found:
            fruit_characteristics[characteristic] = list(found)

    if not fruit_characteristics:
        return None
    return pd.Series(fruit_characteristics)
def phenotype_text(text):
    """Extract coded characteristics from *text* using word tokens.

    *text* is split with ``word_tokenize``. For each characteristic in
    ``characteristics`` and each token, a label is recorded when its value
    part (the text after the prefix) equals the lower-cased token. In
    addition, if the token has WordNet synsets, every label ending with the
    first lemma name of the first synset is recorded.

    For "fruit flavor", any recorded entry starting with "above", "below",
    "good" or "poor" is replaced by "ff_average".

    Returns a pandas Series mapping characteristic name to a list of unique
    labels (order unspecified), or ``None`` when nothing was found.
    """
    tokens = word_tokenize(text)

    # First lemma name of the first WordNet synset per distinct token (None
    # when the token has no synsets), looked up once and reused.
    head_lemma = {}
    flavor_qualifiers = ("above", "below", "good", "poor")

    fruit_characteristics = {}
    for characteristic, synonyms in characteristics.items():
        found = set()
        for word in tokens:
            lowered = word.lower()
            exact = [syn for syn in synonyms if syn.split("_")[1] == lowered]
            found.update(exact)
            if len(exact) == len(synonyms):
                # The WordNet path is only taken for labels that did not
                # match exactly.
                continue
            if word not in head_lemma:
                senses = wordnet.synsets(word)
                head_lemma[word] = (
                    senses[0].lemmas()[0].name() if senses else None)
            head = head_lemma[word]
            if head is not None:
                found.update(s for s in synonyms if s.endswith(head))
        if not found:
            continue
        if characteristic == "fruit flavor":
            # Build a new collection instead of removing from and appending
            # to the one being iterated, which skipped entries.
            # NOTE(review): the labels visible in `characteristics` start
            # with "ff_", so this prefix test does not currently match any
            # of them -- confirm whether the value part was meant instead.
            found = {
                "ff_average" if label.startswith(flavor_qualifiers) else label
                for label in found
            }
        fruit_characteristics[characteristic] = list(found)

    if not fruit_characteristics:
        return None
    return pd.Series(fruit_characteristics)
# Alternative citrus transcript for the phenotype_text variants:
# sample = "thick peel Harvest clean fruit flesh juicy fruit size is medium average fruit flavor with moderate brix moderate acid High seed number trifoliate aftertaste come back and recheck"

# Okra transcript; every observation is introduced by "new plant".
sample = (
    "new plant red green stem one pod "
    "new plant one pods "
    "new plant not flowering "
    "new plant not flowering "
    "new plant not flowering "
    "new plant one pod "
    "new plant not flowering"
)

# Batch run over a transcript CSV (disabled):
# df = pd.read_csv("test_log_with_transcript.csv")
# processed_df = pd.concat(
#     [df, df['transcript'].apply(extract_features)], axis=1)
# processed_df.to_csv("phenotype_characteristics.csv", index=False)
# print(extract_features(sample))

# Print each observation followed by the features extracted from it. The
# transcript starts with the delimiter, so the first part is an empty string.
for part in sample.split("new plant"):
    print(part)
    print(extract_features(part))

# Side-by-side comparison of the other extractors (disabled):
# print("=========BASE===========")
# print(phenotype_text(sample))
# print("=========REGEX===========")
# print(reg_phenotype_text(sample))
# print("=========CONTEXT===========")
# print(context_phenotype_text(sample, 2))
# print("=========SPACY===========")
# print(extract_features(sample))