forked from CentreForCorpusResearch/clic-annotation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsentences.py
121 lines (109 loc) · 4.83 KB
/
sentences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import sys
import re
from lxml import etree
class SentenceTokenizer(object):
    """Wrap the text of every ``<p>`` element of an XML document in ``<s>`` tags.

    The document is parsed once in ``__init__``; ``tokenize()`` then splits
    each paragraph into sentences, numbers the sentences per ``<div>`` and
    prints the resulting XML.
    """

    # Stand-in for the full stop of a recognised abbreviation while the text
    # is being split, so that "Mr." or "Dr." is not taken for a sentence end.
    # It is turned back into "." before the sentences are returned.
    _PROTECTED_PERIOD = '&#46;'

    # Sentence pattern history:
    # - 11/10/08 (Cat): (?:[\.!?:]|--) changed to (?:[\.!?:]|--|$); a second
    #   sentence without final punctuation was getting lost, and the texts
    #   are not clean enough to trust the punctuation to be there.
    # - 18/02/11 (Cat): "--" removed again; it does not indicate the end of a
    #   sentence in the majority of cases. May need to be put back in.
    # Earlier form:
    #   '.+?(?<!\.\.)(?:[\.!?:]|--)["\'\)]{0,2}(?=\s+|$)(?!\s*[a-z])'
    sentence_regex = re.compile(r"""
        .+?              # shortest possible run of text (stays on one line)
        (?<!\.\.)        # the terminator may not directly follow two dots
        (?:[\.!?:]|$)    # terminator: one of . ! ? : or the end of the text
        ["\'\)]{0,2}     # up to two closing quotes or brackets
        (?=\s+|$)        # followed by whitespace or the end of the text
        (?!\s*[a-z])     # and not continued by a lower-case letter
        """, re.VERBOSE)

    # group 1: what precedes the abbreviation, group 2: the abbreviation
    # itself (without its final period), group 3: what follows the period.
    abbreviation_regex = re.compile(
        r'(^|\s|["\'-])'
        r'([^\s]+?\.[a-zA-Z]+|Prof|Dr|Sr|Mr|Mrs|Ms|Jr|Capt|Gen|Col|Sgt|No'
        r'|[ivxjCcl]+|[A-HJ-Z])'
        r'\.(\s|$)')

    # Paragraphs are separated by one or more blank lines.
    paragraph_regex = re.compile(r'\n\n+')

    # An ampersand that does not start a numeric character reference.
    _bare_ampersand_regex = re.compile(r'&(?!#[0-9]+;)')

    def __init__(self, text):
        """
        :param text: the XML document to analyze, as a string
        """
        self.text = text  # TODO unicode(text)?
        self.tree = etree.fromstring(self.text)

    def split_paragraph_into_sentences(self, text):
        """
        Split ``text`` into paragraphs and each paragraph into sentences.

        :param text: plain text; paragraphs are separated by blank lines
        :return: one list of sentence strings per paragraph. Sentences keep
                 their surrounding whitespace. A paragraph in which the
                 sentence pattern finds nothing is returned as one sentence.

        Output looks like:
        [['Mr. Smith went home.', ' He slept.'], ['Next paragraph.']]

        NOTE(review): text the sentence pattern cannot match -- for example a
        trailing sentence that ends in "..." after an earlier match -- is not
        returned by ``findall`` and is therefore dropped. Left as is because
        changing it would alter the tokenization output.
        """
        protect = r'\1\2' + self._PROTECTED_PERIOD + r'\3'
        paragraph_list = []
        for paragraph in self.paragraph_regex.split(text):
            # Hide abbreviation periods from the sentence pattern.
            protected = self.abbreviation_regex.sub(protect, paragraph)
            sentences = self.sentence_regex.findall(protected)
            if not sentences:
                sentences = [protected]
            # Put the hidden periods back.
            paragraph_list.append(
                [s.replace(self._PROTECTED_PERIOD, '.') for s in sentences])
        return paragraph_list

    def add_tags(self, text):
        """
        Wrap every sentence of ``text`` in ``<s>...</s>``.

        :param text: plain text, see ``split_paragraph_into_sentences``
        :return: a single string in which each stripped sentence is wrapped
                 in ``<s>`` tags and followed by one space

        Output looks like:
        <s>Mr. Smith went home.</s> <s>He slept.</s>
        """
        paragraph_list = self.split_paragraph_into_sentences(text)
        return ''.join(
            '<s>%s</s> ' % sentence.strip()
            for paragraph in paragraph_list
            for sentence in paragraph)

    @classmethod
    def _escape_bare_ampersands(cls, markup):
        """
        Replace every ``&`` that does not start a numeric character
        reference (``&#123;``) with ``&amp;`` so the markup can be parsed.
        """
        return cls._bare_ampersand_regex.sub('&amp;', markup)

    def update_tree(self):
        """
        Replace the leading text of every ``<p>`` in ``self.tree`` with
        ``<s>`` child elements. This updates the original self.tree.
        """
        for p in self.tree.xpath('//p'):
            paragraph_text = p.text
            if paragraph_text is None:
                # Nothing to tokenize (empty <p>, or one that starts with a
                # child element).
                continue
            tokenized = self.add_tags(paragraph_text)
            # uncomment the following line to get pure examples of sentence
            # tokenization
            # print(tokenized)
            tokenized = self._escape_bare_ampersands(tokenized).lstrip()
            # NOTE(review): a literal "<" in the paragraph text is not
            # escaped and would still make this parse fail.
            nodetree = etree.fromstring('<foo>%s</foo>' % tokenized)
            p.text = None
            # Iterate over a snapshot: append() moves each node out of
            # nodetree.
            for n in list(nodetree):
                p.append(n)

    def add_sentence_ids(self):
        """
        Number the ``<s>`` elements found under ``p`` children of each
        ``<div>``, starting at 1 per div.

        Each sentence gets ``sid`` (the number) and ``id``
        (``<book>.c<num>.s<number>``), where ``book`` and ``num`` are
        attributes of the div. A div that contains sentences but lacks one
        of these attributes raises ``TypeError``.
        """
        for chapter in self.tree.xpath('//div'):
            book = chapter.get('book')
            num = chapter.get('num')
            for scount, s in enumerate(chapter.xpath('p/s'), 1):
                s.set('sid', str(scount))
                s.set('id', book + '.c' + num + '.s' + str(scount))

    def tokenize(self):
        """
        Tokenize the document, print it and return it.

        Printing the output because the bash scripts uses the printed output.

        :return: the serialized tree as returned by ``etree.tostring``
        """
        self.update_tree()
        self.add_sentence_ids()
        printable_tree = etree.tostring(self.tree)
        if isinstance(printable_tree, str):
            print(printable_tree)
        else:
            # Python 3: tostring() returns bytes (ASCII by default); decode
            # so that plain XML, not a b'...' repr, reaches stdout.
            print(printable_tree.decode('ascii'))
        return printable_tree
if __name__ == "__main__":
    # Usage: pass the path of the XML file to tokenize as the only argument.
    # The tokenized document is printed to stdout by tokenize().
    if len(sys.argv) < 2:
        sys.exit('usage: %s <xml-file>' % sys.argv[0])
    # Read bytes rather than text so the XML parser can apply the encoding
    # declared in the document itself.
    with open(sys.argv[1], 'rb') as a_file:
        a_text = a_file.read()
    tokenizer = SentenceTokenizer(a_text)
    tokenizer.tokenize()