-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleanup-data.scm
299 lines (262 loc) · 9.13 KB
/
cleanup-data.scm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
;
; cleanup-data.scm
;
; The raw data is a tad messy and problematic. The functions here fix
; assorted issues.
;
(use-modules (opencog exec))
(use-modules (srfi srfi-1))
; --------------------
; Performance stats timer
(define (make-timer)
  "
  make-timer -- Create a lap timer.

  Returns a procedure of no arguments. Each call to that procedure
  returns the time, in seconds, that has passed since the previous
  call (or, for the first call, since the timer was created), and
  then restarts the measurement from the current instant.
  "
  ; Timestamp of the most recent lap, in internal time units.
  (define last-tick (get-internal-real-time))
  (lambda ()
    (let* ((tick (get-internal-real-time))
           (elapsed (/ (- tick last-tick) internal-time-units-per-second)))
      ; Restart the lap before handing back the result.
      (set! last-tick tick)
      elapsed)))
; --------------------
(define (delete-go-nodes)
  "
  Delete the GO (Gene Ontology) nodes, as they are not pathways.
  The way they are encoded interferes with pathway nodes.

  Every ConceptNode whose name contains the substring `GO:` is
  removed with cog-delete-recursive.
  "
  ; True when the node name mentions GO: anywhere in it.
  (define (go-node? concept)
    (string-contains (cog-name concept) "GO:"))

  (for-each
    (lambda (concept)
      (when (go-node? concept)
        (cog-delete-recursive concept)))
    (cog-get-atoms 'ConceptNode)))
; --------------------
(define (delete-self-interaction)
  "
  Many genes are marked as interacting with themselves.
  Delete these, they screw up the topology of the searches.

  Searches for every atom X such that (List X X) exists, prints how
  many were found, and removes each such ListLink together with all
  links that contain it. The search pattern built for this purpose
  is removed again, so that no (List $x $x) stays behind.
  "
  (define var-x (Variable "$x"))
  (define selfie-pattern (List var-x var-x))
  (define selfie-q (Get selfie-pattern))
  (define selfie-set (cog-execute! selfie-q))
  (define selfies (cog-outgoing-set selfie-set))

  ; Drop the result set, then take the query apart from the top down.
  ; The pattern is itself a (List x x) link, so leaving it in place
  ; would leave behind exactly the kind of atom being cleaned up.
  ; cog-delete is non-recursive: an atom that is still held by some
  ; other link is left alone, so unrelated users of $x are not harmed.
  (cog-delete selfie-set)
  (cog-delete selfie-q)
  (cog-delete selfie-pattern)
  (cog-delete var-x)

  (format #t "Deleting ~A self-interactions\n" (length selfies))
  (for-each
    (lambda (gene) (cog-delete-recursive (List gene gene)))
    selfies))
; --------------------
(define (delete-bad-chebi)
  "
  Delete (MoleculeNode \"ChEBI:nan\") and all links that contain it.
  This is not a valid protein. Also delete some other junk: the
  ConceptNodes for SMPD1 through SMPD4 whose names end in a blank.
  "
  (cog-delete-recursive (MoleculeNode "ChEBI:nan"))

  ; The trailing blank in each name is deliberate; that is how the
  ; junk nodes are spelled.
  (for-each
    (lambda (junk-name)
      (cog-delete-recursive (ConceptNode junk-name)))
    (list "SMPD1 " "SMPD2 " "SMPD3 " "SMPD4 "))
  *unspecified*)
; --------------------
(define (count-gene-interactions)
  "
  Count the number of symmetric and non-symmetric gene-pair interactions
  in the dataset. The gene interactions use the asymmetric ListLink to
  denote gene pairs. This counts the total number of interactions, the
  number of paired and unpaired interactions, and the number of
  self-interactions.

  The counts are printed; nothing useful is returned. When done, every
  link that contains (Variable \"g1\") or (Variable \"g2\") is removed,
  which takes the search patterns out of the AtomSpace again.
  "
  ; All internal definitions come first, so that the body is also valid
  ; on Scheme systems that do not allow a definition after an expression.

  ; Both search variables are restricted to GeneNodes.
  (define gene-pair-vars
    (VariableList
      (TypedVariable (Variable "g1") (Type 'GeneNode))
      (TypedVariable (Variable "g2") (Type 'GeneNode))))

  ; The directed interaction A -> B.
  (define (interaction a b)
    (Evaluation (Predicate "interacts_with") (List a b)))

  ; Total number of interactions
  (define count-q
    (Get gene-pair-vars
      (interaction (Variable "g1") (Variable "g2"))))

  ; Symmetric interactions: both directions are present.
  (define sym-q
    (Get gene-pair-vars
      (And
        (interaction (Variable "g1") (Variable "g2"))
        (interaction (Variable "g2") (Variable "g1")))))

  ; Self-interactions
  (define self-q
    (Get
      (TypedVariable (Variable "g1") (Type 'GeneNode))
      (interaction (Variable "g1") (Variable "g1"))))

  ; Run QUERY, discard the result set, print the number of results
  ; using the format string MESSAGE, and return that number.
  (define (run-and-report query message)
    (let* ((result-set (cog-execute! query))
           (tally (length (cog-outgoing-set result-set))))
      (cog-delete result-set)
      (format #t message tally)
      tally))

  ; let* evaluates in order, so each count is printed before the
  ; next search is started.
  (let* ((n-acts (run-and-report count-q
                   "Found ~A gene interactions\n"))
         (n-sym (run-and-report sym-q
                  "Found ~A symmetric (paired) gene interactions\n"))
         (n-self (run-and-report self-q
                   "Found ~A self-interacting genes\n")))

    ; Remove the search patterns.
    (cog-delete-recursive (Variable "g1"))
    (cog-delete-recursive (Variable "g2"))

    ; Interactions that lack the reversed partner.
    (let* ((n-asym (- n-acts n-sym))
           ; After adding the missing reversed partners there would be
           ; n-acts + n-asym directed pairs. Dropping the self-pairs and
           ; halving gives the number of undirected edges.
           (n-edge (/ (- (+ n-acts n-asym) n-self) 2)))
      (format #t "Conclude: there are ~A asymmetric interactions\n" n-asym)
      (format #t "Conclude: there are ~A symmetrized interactions\n" n-edge)))
  *unspecified*)
; --------------------
(define (symmetrize-gene-interactions)
  "
  The gene interactions use the asymmetric ListLink to denote
  gene pairs. But gene interactions are (meant to be) symmetric, so
  they should have used the SetLink. Oh well. At this time, it is
  convenient to use the ListLink during pattern searches; but in this
  case, the interactions should be symmetrized. That's what this does.

  For every `interacts_with` pair (g1, g2) of GeneNodes that is
  present, the reversed pair (g2, g1) is created. The number of
  results is printed, and the QueryLink is deleted afterwards.
  "
  (let* ((gene-a (Variable "g1"))
         (gene-b (Variable "g2"))
         (interacts (Predicate "interacts_with"))
         (interact-q
           (Query
             (VariableList
               (TypedVariable gene-a (Type 'GeneNode))
               (TypedVariable gene-b (Type 'GeneNode)))
             ; Search for the forward direction ...
             (Present
               (Evaluation interacts (List gene-a gene-b)))
             ; ... and write out the reverse direction.
             (Evaluation interacts (List gene-b gene-a))))
         (mirrored (cog-execute! interact-q)))
    (format #t "Found ~A gene interactions\n"
      (length (cog-value->list mirrored)))
    (cog-delete interact-q))
  *unspecified*)
; --------------------
(define (make-gene-pairs)
  "
  The gene interactions use the asymmetric ListLink to denote
  gene pairs. But gene interactions are (meant to be) symmetric, so
  they should have used the SetLink. We make this now.

  For every `interacts_with` pair (g1, g2) of GeneNodes that is
  present, a `gene-pair` Evaluation over (Set g1 g2) is created.
  The number of results and the elapsed time are printed, and the
  QueryLink is deleted afterwards.

  See also: make-triangles, make-tetrahedra
  "
  (let* ((gene-a (Variable "g1"))
         (gene-b (Variable "g2"))
         (pair-q
           (Query
             (VariableList
               (TypedVariable gene-a (Type 'GeneNode))
               (TypedVariable gene-b (Type 'GeneNode)))
             ; Search for the ordered pair ...
             (Present
               (Evaluation (Predicate "interacts_with")
                 (List gene-a gene-b)))
             ; ... and write out the unordered pair.
             (Evaluation (Predicate "gene-pair")
               (Set gene-a gene-b))))
         ; Start timing just before the query runs.
         (stopwatch (make-timer))
         (pairs (cog-execute! pair-q)))
    (format #t "Created ~A symmetric gene-pairs in ~6f seconds\n"
      (length (cog-value->list pairs)) (stopwatch))
    (cog-delete pair-q))
  *unspecified*)
; --------------------
(define (delete-all-but-interactions)
  "
  Remove pretty much everything that is not a gene or a protein
  interacting with one-another.
  The goal is to reduce the atomspace to something manageable and more
  responsive during gene graph searches.
  This assumes that `(make-gene-pairs)` has already run.
  "
  ; All internal definitions come first, so that the body is also valid
  ; on Scheme systems that do not allow a definition after an expression.

  ; Delete ATOM, but only if no link holds it.
  (define (delete-if-orphan atom)
    (if (= 0 (cog-incoming-size atom)) (cog-delete atom)))

  ; Delete every atom of type ATOM-TYPE that no link holds.
  (define (delete-orphans-of-type atom-type)
    (for-each delete-if-orphan (cog-get-atoms atom-type)))

  ; True when GENE appears directly in at least one SetLink.
  (define (in-gene-pair? gene)
    (< 0 (cog-incoming-size-by-type gene 'Set)))

  ; What about the first two?
  (for-each
    (lambda (pred-name)
      (cog-delete-recursive (PredicateNode pred-name)))
    (list
      "transcribed_to"
      "translated_to"
      "has_location"
      "has_name"
      "has_pubmedID"
      "GO_name"
      "has_biogridID"
      "GO_namespace"
      "has_entrez_id"
      "has_current_symbol"))

  ; The symmetrized gene-pair has rendered this useless.
  (cog-delete-recursive (PredicateNode "interacts_with"))

  ; The pentagons depend on interacting genes, so kill all genes
  ; that aren't in some gene-pair. The victims are picked out before
  ; anything is deleted.
  (for-each cog-delete-recursive
    (filter
      (lambda (gene) (not (in-gene-pair? gene)))
      (cog-get-atoms 'GeneNode)))

  ; The above will orphan many ListLinks. Delete them. The orphans are
  ; picked out before anything is deleted.
  (for-each cog-delete
    (filter
      (lambda (lst) (= 0 (cog-incoming-size lst)))
      (cog-get-atoms 'List)))

  ; We are not looking at InheritanceLinks for anything.
  ; Once the links are gone, then orphan nodes show up.
  (for-each delete-orphans-of-type
    (list 'Inheritance 'Concept 'Molecule 'Gene))
  *unspecified*)
; --------------------
(define (delete-all-but-gene-interactions)
  "
  Remove pretty much everything that is not a gene that belongs to
  a gene-pair. The goal is to reduce the atomspace to something
  manageable and more responsive during gene graph searches.
  This assumes that `(make-gene-pairs)` has already run.

  Prints the elapsed time and a report of what remains.
  "
  (define stopwatch (make-timer))

  ; Wipe out whole categories of atoms, one type after the other.
  (for-each
    (lambda (atom-type)
      (for-each cog-delete-recursive (cog-get-atoms atom-type)))
    (list 'Molecule 'Concept 'List 'Variable))

  ; Of the predicates, only gene-pair survives.
  (for-each
    (lambda (pred)
      (unless (equal? pred (Predicate "gene-pair"))
        (cog-delete-recursive pred)))
    (cog-get-atoms 'Predicate))

  ; Genes that no link holds any longer are of no use.
  (for-each
    (lambda (gene)
      (when (zero? (cog-incoming-size gene))
        (cog-delete gene)))
    (cog-get-atoms 'Gene))

  (format #t "Cleaned out non-genomic data ~6f seconds\n" (stopwatch))
  (format #t "What's left: ~A\n" (cog-report-counts))
  *unspecified*)
; --------------------
(define (delete-simple-tv)
  "
  Delete the SimpleTruthValues on all atoms in the atomspace.
  The problem is that calling `get-count` on a SimpleTruthValue
  returns garbage, thus messing up statistics. Unfortunately,
  this cannot be fixed, because the PLN book documents the garbage;
  it's part of the spec. Whoops.

  Atoms whose truth value passes `cog-ctv?` are left untouched.
  Prints the elapsed time.
  "
  (define stopwatch (make-timer))

  ; Setting to (stv 1 0) sets it to DEFAULT_TV, which frees
  ; the RAM in the AtomSpace.
  (define (reset-tv! atom)
    (unless (cog-ctv? (cog-tv atom))
      (cog-set-tv! atom (stv 1 0))))

  ; NOTE(review): the #t presumably asks for subtypes of Atom as
  ; well, i.e. for every atom -- confirm against the cog-get-atoms docs.
  (for-each reset-tv! (cog-get-atoms 'Atom #t))

  (format #t "Removed SimpleTV in ~6f seconds\n" (stopwatch))
  *unspecified*)
; --------------------