-
Notifications
You must be signed in to change notification settings - Fork 9
/
wordnet-database-files.lisp
184 lines (157 loc) · 7.25 KB
/
wordnet-database-files.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
;;; CommonLisp interface to WordNet
;;; 1995, Mark Nahabedian
;;; Artificial Intelligence Laboratory
;;; Massachusetts Institute of Technology
;;; Code for reading WordNet "data" and "index" files as output by WordNet's
;;; "grind" command. These functions merely return the requested entry as a
;;; string. These are other functions which will parse those strings and extract
;;; the data.
;;; This file provides the functions INDEX-ENTRY-FOR-WORD and
;;; READ-DATA-FILE-ENTRY which are used to read raw data from WordNet's compiled
;;; database files. You must tell them which file to read from using the
;;; FILE-DESCRIPTION argument. You can pass a file stream (as opened by
;;; WITH-WORDNET-FILE-STREAM), a pathname or a part of speech keyword as the
;;; value for this argument.
;;; (INDEX-ENTRY-FOR-WORD file-description word)
;;; Looks up word in the appropriate index file and returns the string
;;; corresponding to that record of the index file.
;;; (READ-DATA-FILE-ENTRY file-description offset)
;;; Reads a WordNet "symset" record from the specified offset in the specified
;;; file. A string is returned. Offset was either read from an index record,
;;; or from a pointer description in another synset record.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(in-package #:wordnet)
(defparameter +wordnet-database-directory+
(asdf:system-relative-pathname '#:wordnet "dict/"))
(defparameter +wordnet-data-files+
(directory (make-pathname :name "data"
:type :wild
:defaults +wordnet-database-directory+)))
(defparameter +wordnet-index-files+
(directory (make-pathname :name "index"
:type :wild
:defaults +wordnet-database-directory+)))
(defparameter +wordnet-exception-files+
(directory (make-pathname :name :wild
:type "exc"
:defaults +wordnet-database-directory+)))
(defun file-for-part-of-speech (part-of-speech file-type)
(car (member (part-of-speech-file-type part-of-speech)
(ecase file-type
(:index +wordnet-index-files+)
(:data +wordnet-data-files+))
:test #'string-equal
:key #'pathname-type)))
(defun exception-file-for-part-of-speech (part-of-speech)
(car (member (part-of-speech-file-type part-of-speech)
+wordnet-exception-files+
:test #'string-equal
:key #'pathname-name)))
(defmacro with-wordnet-file-stream ((stream-var file-identifier) &body body)
`(with-open-file (,stream-var ,file-identifier :element-type 'character)
,@body))
(defun skip-wordnet-preamble (stream)
(let ((position 0) line)
(file-position stream 0)
(loop
(unless (string-equal " " (setq line (read-line stream))
:end2 2)
(return (values position line)))
(setq position (file-position stream)))))
(defgeneric read-data-file-entry (from-where file-offset)
(:documentation
"Return a string that represents the synset at OFFSET in the specified
WordNet data file"))
(defmethod read-data-file-entry ((pos symbol) file-offset)
(read-data-file-entry (file-for-part-of-speech pos :data) file-offset))
(defmethod read-data-file-entry ((pathname pathname) file-offset)
(with-wordnet-file-stream (stream pathname)
(read-data-file-entry stream file-offset)))
(defmethod read-data-file-entry ((data-file-stream t) file-offset)
(file-position data-file-stream file-offset)
(let ((raw-data (read-line data-file-stream)))
(unless (= (parse-integer raw-data :end (position #\space raw-data))
file-offset)
(error "The offset ~d does not point to a synset entry in the file ~s"
file-offset data-file-stream))
raw-data))
(defgeneric index-entry-for-word (from-where word)
(:documentation
"Return a string that is the entry for WORD in the specified WordNet index file"))
(defmethod index-entry-for-word ((pos null) word)
nil)
(defmethod index-entry-for-word ((pos symbol) word)
(index-entry-for-word (file-for-part-of-speech pos :index) word))
(defmethod index-entry-for-word ((pathname pathname) word)
(with-wordnet-file-stream (stream pathname)
(index-entry-for-word stream word)))
(defmethod index-entry-for-word ((index-file-stream t) word)
(let* ((bottom 0)
(top (file-length index-file-stream))
(word (string-downcase word))
(word-length (length word))
try where raw-entry)
(setq bottom (skip-wordnet-preamble index-file-stream))
(setq try (floor (+ bottom top) 2))
(loop
(multiple-value-setq (raw-entry where)
(find-index-entry-raw index-file-stream try))
;;(format t "~&~8d ~8d ~8d ~8d ~s" bottom top try where raw-entry)
(cond ((null raw-entry)
(return nil))
((and (< (length word) (length raw-entry))
(string-equal raw-entry word :end1 word-length)
(char-equal #\space (aref raw-entry word-length)))
;; no suffix, we found it
(return raw-entry))
((and (>= where top)
(= where bottom))
(return nil))
((>= where top)
;; The line that TRY points into is the last line of the current
;; search space. Start searching from BOTTOM.
(setq try bottom))
((string< raw-entry word)
;; beginning of next line
(setq bottom (+ where (length raw-entry) 1)
try (floor (+ bottom top) 2)))
((string>= raw-entry word)
;; end of previous line
(setq top (1- where)
try (floor (+ bottom top) 2)))))))
(defun find-index-entry-raw (index-file-stream file-offset)
(let ((file-offset file-offset))
(file-position index-file-stream (max 0 (1- file-offset)))
(unless (char-equal #\newline (read-char index-file-stream))
;; if not at a line boundary, skip to end of line
(read-line index-file-stream nil nil)
(setq file-offset (file-position index-file-stream)))
(values (read-line index-file-stream nil nil) file-offset)))
(defun list-index-file (file from to)
(with-wordnet-file-stream (stream file)
(let (line where)
(multiple-value-setq (where line)
(skip-wordnet-preamble file))
(loop
(unless (string< line from :end1 (min (length from) (length line)))
(return))
(setq line (read-line stream)))
(setq where (- (file-position stream) (length line) 1))
(loop
(when (string> line to :end1 (length to))
(return))
(format t "~&~8d~15t~s" where line)
(setq where (file-position stream))
(setq line (read-line stream))))))
#|
;;; Test cases for INDEX-ENTRY-FOR-WORD.
(index-entry-for-word :noun "cat")
(index-entry-for-word :noun "bed")
(index-entry-for-word :noun "0") ;it's in there
(index-entry-for-word :noun "'s_gravenhage") ;first noun
(index-entry-for-word :noun "foobar") ;not there, should return nil
(index-entry-for-word :noun "zymurgy") ;last noun
|#
(defmethod exception-entry-for-word ((pos symbol) word)
(with-wordnet-file-stream (stream (exception-file-for-part-of-speech pos))
(index-entry-for-word stream word)))