-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathsentence-generator.py
executable file
·137 lines (121 loc) · 4.44 KB
/
sentence-generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/python
import re
import random
import sys
# These mappings can get fairly large -- they're stored globally to
# save copying time.
# (tuple of words) -> {dict: word -> number of times the word appears following the tuple}
# Example entry:
# ('eyes', 'turned') => {'to': 2.0, 'from': 1.0}
# Used briefly while first constructing the normalized mapping
tempMapping = {}
# (tuple of words) -> {dict: word -> *normalized* number of times the word appears following the tuple}
# Example entry:
# ('eyes', 'turned') => {'to': 0.66666666, 'from': 0.33333333}
mapping = {}
# Contains the set of words that can start sentences
starts = []
# We want to be able to compare words independent of their capitalization.
def fixCaps(word):
# Ex: "FOO" -> "foo"
if word.isupper() and word != "I":
word = word.lower()
# Ex: "LaTeX" => "Latex"
elif word [0].isupper():
word = word.lower().capitalize()
# Ex: "wOOt" -> "woot"
else:
word = word.lower()
return word
# Tuples can be hashed; lists can't. We need hashable values for dict keys.
# This looks like a hack (and it is, a little) but in practice it doesn't
# affect processing time too negatively.
def toHashKey(lst):
return tuple(lst)
# Returns the contents of the file, split into a list of words and
# (some) punctuation.
def wordlist(filename):
f = open(filename, 'r')
wordlist = [fixCaps(w) for w in re.findall(r"[\w']+|[.,!?;]", f.read())]
f.close()
return wordlist
# Self-explanatory -- adds "word" to the "tempMapping" dict under "history".
# tempMapping (and mapping) both match each word to a list of possible next
# words.
# Given history = ["the", "rain", "in"] and word = "Spain", we add "Spain" to
# the entries for ["the", "rain", "in"], ["rain", "in"], and ["in"].
def addItemToTempMapping(history, word):
global tempMapping
while len(history) > 0:
first = toHashKey(history)
if first in tempMapping:
if word in tempMapping[first]:
tempMapping[first][word] += 1.0
else:
tempMapping[first][word] = 1.0
else:
tempMapping[first] = {}
tempMapping[first][word] = 1.0
history = history[1:]
# Building and normalizing the mapping.
def buildMapping(wordlist, markovLength):
global tempMapping
starts.append(wordlist [0])
for i in range(1, len(wordlist) - 1):
if i <= markovLength:
history = wordlist[: i + 1]
else:
history = wordlist[i - markovLength + 1 : i + 1]
follow = wordlist[i + 1]
# if the last elt was a period, add the next word to the start list
if history[-1] == "." and follow not in ".,!?;":
starts.append(follow)
addItemToTempMapping(history, follow)
# Normalize the values in tempMapping, put them into mapping
for first, followset in tempMapping.iteritems():
total = sum(followset.values())
# Normalizing here:
mapping[first] = dict([(k, v / total) for k, v in followset.iteritems()])
# Returns the next word in the sentence (chosen randomly),
# given the previous ones.
def next(prevList):
sum = 0.0
retval = ""
index = random.random()
# Shorten prevList until it's in mapping
while toHashKey(prevList) not in mapping:
prevList.pop(0)
# Get a random word from the mapping, given prevList
for k, v in mapping[toHashKey(prevList)].iteritems():
sum += v
if sum >= index and retval == "":
retval = k
return retval
def genSentence(markovLength):
# Start with a random "starting word"
curr = random.choice(starts)
sent = curr.capitalize()
prevList = [curr]
# Keep adding words until we hit a period
while (curr not in "."):
curr = next(prevList)
prevList.append(curr)
# if the prevList has gotten too long, trim it
if len(prevList) > markovLength:
prevList.pop(0)
if (curr not in ".,!?;"):
sent += " " # Add spaces between words (but not punctuation)
sent += curr
return sent
def main():
if len(sys.argv) < 2:
sys.stderr.write('Usage: ' + sys.argv [0] + ' text_source [chain_length=1]\n')
sys.exit(1)
filename = sys.argv[1]
markovLength = 1
if len (sys.argv) == 3:
markovLength = int(sys.argv [2])
buildMapping(wordlist(filename), markovLength)
print genSentence(markovLength)
if __name__ == "__main__":
main()