forked from hrs/markov-sentence-generator
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathchain-interpreter.py
executable file
·132 lines (111 loc) · 4.94 KB
/
chain-interpreter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""A utility unit that generates new text based on Markov chains created from one
or more source texts. THIS unit is a stripped-down version of my own more
full-featured text generator, which is in turn based on code by HR Schwartz;
my more full-featured code is available at
https://github.com/patrick-brian-mooney/markov-sentence-generator
and a link to Schwartz's original source is available in the same place. Note
that the full-featured version is required to GENERATE files that this module
can only INTERPRET. That is: this stripped-down version is a utility for
projects that need to generate text based on pre-generated chains, but don't
need the additional features provided by the larger model.
"""
import pickle, random, sys
# Punctuation tables used when joining generated tokens into a sentence. Each
# one is a plain string that is consulted with the `in` operator, so a token
# "matches" a table when it is a substring of that table.

# Punctuation that should be followed by a space. Not referenced by the code
# visible in this file -- presumably kept for parity with the full-featured
# generator; confirm before removing.
punct_with_space_after = r'.,\:!?;'
# A token found in this string terminates the sentence being generated.
sentence_ending_punct = r'.!?'
# No space is inserted in front of a token found in this string.
punct_with_no_space_before = r'.,!?;—․-:/'
# No space is inserted after a token found in this string.
punct_with_no_space_after = r'—-/․'
def to_hash_key(lst):
    """Convert the sequence LST into a tuple so it can serve as a dictionary key.

    Lists are unhashable; the tuple holds the same items in the same order.
    """
    key = tuple(lst)
    return key
def _find_first_alphanumeric(w):
for i, c in enumerate(w):
if c.isalpha() or c.isnumeric():
return i
return -1
def capitalize(w):
if len(w) < 2: return w
f = _find_first_alphanumeric(w)
if f == -1:
return w
else:
return w[:f] + w[f].upper() + w[1 + f:]
class MarkovChainTextModel(object):
    """Read-only holder for a set of pre-generated Markov chains.

    The chains are read from a pickled dictionary that must provide the keys
    'markov_length', 'starts', 'the_mapping' and 'character_tokens'. Their
    values are exposed as the attributes markov_length, the_starts,
    the_mapping and character_tokens respectively.
    """

    def __init__(self, filename):
        """Load the chains stored in the pickle file FILENAME.

        If the file cannot be opened or read, an error message is written to
        stderr and the process exits with status 1. If the file cannot be
        unpickled (including a truncated or empty file), the message is
        followed by exit status 2.

        Raises KeyError if the unpickled dictionary lacks one of the required
        keys, and AssertionError if the chains were built from character
        tokens, which this stripped-down interpreter does not handle.
        """
        try:
            with open(filename, 'rb') as the_chains_file:
                # SECURITY: unpickling can execute arbitrary code. Only load
                # chains files that come from a trusted source.
                chains_dictionary = pickle.load(the_chains_file)
        except IOError as e:
            print("ERROR: Can't read chains from %s; the system said '%s'." % (filename, e), file=sys.stderr)
            sys.exit(1)
        except (pickle.PickleError, EOFError) as e:
            # pickle reports a truncated or empty file as EOFError rather than
            # as a PickleError, so both are treated as "not a usable pickle".
            print("ERROR: Can't read chains from %s because a pickling error occurred; the system said '%s'." % (filename, e), file=sys.stderr)
            sys.exit(2)

        self.markov_length = chains_dictionary['markov_length']
        self.the_starts = chains_dictionary['starts']
        self.the_mapping = chains_dictionary['the_mapping']
        self.character_tokens = chains_dictionary['character_tokens']

        # An explicit check rather than an `assert` statement, so that it is
        # still enforced when Python runs with -O. AssertionError is kept as
        # the exception type for the sake of existing callers.
        if self.character_tokens:
            raise AssertionError("ERROR: this script cannot interpret 'character token' Markov chain files.")
class TextGenerator(object):
    """Generate sentences and paragraphs from a pre-built set of Markov chains."""

    def __init__(self, chainsfile, name=None):
        """Load the chains stored in CHAINSFILE.

        NAME is an optional label that is only used by __str__.
        """
        self.name = name
        self.chains = MarkovChainTextModel(chainsfile)

    def __str__(self):
        """Return a short description giving the class, name and Markov length."""
        if self.name:
            return '< class %s, named "%s", with Markov length %d >' % (self.__class__, self.name, self.chains.markov_length)
        return '< class %s (unnamed instance), with Markov length %d >' % (self.__class__, self.chains.markov_length)

    def next(self, prevList, the_mapping):
        """Randomly pick the token that follows the tokens in PREVLIST.

        THE_MAPPING maps tuples of preceding tokens to a dictionary of
        {candidate token: weight}. If the whole of PREVLIST is not a key of
        THE_MAPPING, tokens are dropped from its front until what remains is a
        key. If nothing matches at all, "." is returned. PREVLIST itself is
        never modified.

        The candidate is chosen by walking the weights until their running
        total reaches a random number in [0, 1).
        """
        context = list(prevList)        # Work on a copy: the caller's list must not shrink.
        threshold = random.random()
        try:
            while to_hash_key(context) not in the_mapping:
                context.pop(0)
        except IndexError:
            # The context was exhausted without finding a key: end the sentence.
            return "."

        running_total = 0.0
        chosen = ""
        for token, weight in the_mapping[to_hash_key(context)].items():
            running_total += weight
            # Remember the latest candidate, so that if floating-point rounding
            # leaves the total just short of THRESHOLD, the last candidate is
            # returned instead of an empty string. (Assumes the weights are
            # meant to sum to 1 -- TODO confirm against the chain generator.)
            chosen = token
            if running_total >= threshold:
                break
        return chosen

    def is_trained(self):
        """Return a truthy value if starts, mapping and Markov length are all non-empty.

        The value returned is the result of the `and` chain itself, not
        necessarily a bool.
        """
        return (self.chains.the_starts and self.chains.the_mapping and self.chains.markov_length)

    def _gen_sentence(self):
        """Build and return one capitalized sentence.

        Starts from a randomly chosen sentence opener and keeps appending
        tokens until one that counts as sentence-ending punctuation turns up.
        A sentence consisting of a single character other than "I" is thrown
        away and a new one is generated. This is done in a loop rather than by
        recursion, so a long run of rejects cannot hit the recursion limit.
        """
        max_context = self.chains.markov_length
        while True:
            curr = random.choice(self.chains.the_starts)
            sent = curr
            prevList = [curr]
            while curr not in sentence_ending_punct:
                curr = self.next(prevList, self.chains.the_mapping)
                prevList.append(curr)
                # Keep only as much history as the chains were built with.
                while len(prevList) > max_context:
                    prevList.pop(0)
                if curr not in punct_with_no_space_before:
                    if len(prevList) < 2 or prevList[-2] not in punct_with_no_space_after:
                        sent += " "
                sent += curr

            core = sent.strip().strip(sentence_ending_punct).strip()
            if len(core) == 1 and core.upper() != "I":
                continue                # Reject one-letter "sentences" other than "I".
            return capitalize(sent)

    def _produce_text(self, sentences_desired=1, paragraph_break_probability=0.25):
        """Yield paragraphs of generated text, one string per paragraph.

        SENTENCES_DESIRED sentences are generated in total. After each one, a
        paragraph break is inserted with probability
        PARAGRAPH_BREAK_PROBABILITY, and always after the final sentence. Each
        yielded paragraph is stripped of surrounding whitespace and ends with
        a single newline. Nothing is yielded if SENTENCES_DESIRED is zero.

        The generator simply returns when it is done. Raising StopIteration
        inside a generator is turned into a RuntimeError by Python 3.7+.
        """
        the_text = ""
        for which_sentence in range(sentences_desired):
            if the_text and the_text[-1] != "\n":
                the_text += " "
            the_text += self._gen_sentence()
            if random.random() <= paragraph_break_probability or which_sentence == sentences_desired - 1:
                yield the_text.strip() + "\n"
                the_text = ""
if __name__ == "__main__":
    # Take the chains file from the first command-line argument if one was
    # given; otherwise fall back to the author's default chains file.
    fname = sys.argv[1] if len(sys.argv) >= 2 else '/home/patrick/Documents/programming/python_projects/AutoLovecraft/corpora/previous/All Edited Texts.3.pkl'
    genny = TextGenerator(chainsfile=fname)
    # Generate twenty sentences and print the resulting paragraphs, joined by newlines.
    paragraphs = genny._produce_text(sentences_desired=20)
    print('\n'.join(paragraphs))