text_generator.py (forked from hrs/markov-sentence-generator)
#!/usr/bin/env python3.5
# -*- coding: utf-8 -*-
"""This is the actual code implementing Patrick Mooney's Markov chain-based
text generator, split out into a separate module so that it can easily be
compiled with Cython.

This module is licensed under the GNU GPL, either version 3, or (at your option)
any later version. See the files README.md and LICENSE.md for more details.
"""
import pickle
import re
import random
import time
import typing
from pathlib import Path
import text_handling as th # https://github.com/patrick-brian-mooney/personal-library
cython = None
try:
    import cython
except Exception as errrr:
    print("Unable to import cython! The system said: {}".format(errrr))
__author__ = "Patrick Mooney, http://patrickbrianmooney.nfshost.com/~patrick/"
__version__ = "$v2.3 $"
__date__ = "$Date: 2020/05/05 23:16:00 $"
__copyright__ = "Copyright (c) 2015-20 Patrick Mooney"
__license__ = "GPL v3, or, at your option, any later version"
# logging-related stubs
verbosity_level = 1 # Bump above zero to get more verbose messages about processing and to skip the "are we running on a webserver?" check.
def log_it(what, log_level=1):
    """Handles logging to console, based on current verbosity_level.
    #FIXME: just use the stdlib logging module.
    """
    if log_level <= verbosity_level:
        print(what)
# Basic declarations about English-language text.
punct_with_space_after = r'.,\:!?;'
sentence_ending_punct = r'.!?'
punct_with_no_space_before = r'.,!?;—․-:/'
punct_with_no_space_after = r'—-/․' # Note: that last character is U+2024, "one-dot leader".
word_punct = r"'’❲❳%°#․$" # Punctuation marks to be considered part of a word.
token_punct = r".,:\-!?;—/&…⸻" # These punctuation marks also count as tokens.
# First, some utility functions.
def _is_cythonized() -> bool:
    """Report whether this module is currently running as Cython-compiled code."""
    if cython:
        return cython.compiled
    else:
        return False
def process_acronyms(text: str) -> str:
    """Takes TEXT and looks through it for acronyms. If it finds any, it takes each
    and converts their periods to one-dot leaders to make the Markov parser treat
    the acronym as a single word. Returns the modified string.

    This function is NEVER called directly by any other routine in this file;
    it's a convenience function for code that uses this module. This may change
    in the future, if extensive testing shows there are very very few incorrect
    corrections made.
    """
    remaining_to_process = text[:]
    ret = ""
    # First, search for and deal with sentence-ending acronyms. Doing this requires replacing their dots with a
    # one-dot leader, and then adding a sentence-ending period so the chain parser knows that there's sentence-ending
    # punctuation in the text.
    while remaining_to_process:
        match = re.search(r'([A-Z]\.){2,}\s[A-Z]', remaining_to_process, re.UNICODE)  # Find acronym-whitespace-capital letter
        if match:
            ret += remaining_to_process[:match.start()]
            last_period = remaining_to_process[match.start() : match.end()].rfind('.')
            ret += remaining_to_process[match.start() : 1 + match.start() + last_period].replace('.', '․')
            ret += '.'
            ret += remaining_to_process[1 + match.start() + last_period : match.end()]
            remaining_to_process = remaining_to_process[match.end():]
        else:
            ret += remaining_to_process
            remaining_to_process = ""
    # Now, deal with any remaining unprocessed acronyms.
    remaining_to_process, ret = ret, ""
    while remaining_to_process:
        match = re.search(r'(?:(?<=\.|\s)[A-Z]\.)+', remaining_to_process, re.UNICODE)
        if match:
            ret += remaining_to_process[:match.start()]
            ret += remaining_to_process[match.start():match.end()].replace('.', '․')  # Replace periods with one-dot leaders
            remaining_to_process = remaining_to_process[match.end():]  # Lop off the part we've processed.
        else:
            ret += remaining_to_process
            remaining_to_process = ""
    return ret
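# Illustrative (hypothetical) example of the transformation above: periods inside an
# acronym become one-dot leaders, so the tokenizer later keeps the acronym as one word:
#     process_acronyms("Agents of the F.B.I. watched.")
#         -> "Agents of the F․B․I․ watched."
# An acronym that ends a sentence additionally gets a real sentence-ending period
# appended after its one-dot leaders, so the chain parser still sees a sentence break.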
def to_hash_key(lst: list) -> tuple:
    """Tuples can be hashed; lists can't. We need hashable values for dict keys.
    This looks like a hack (and it is, a little) but in practice it doesn't
    affect processing time too negatively.

    This is no longer used -- tuple() is now called directly to spare a function
    call -- but it's been allowed to stay here because of historical affection.
    """
    return tuple(lst)
def apply_defaults(defaultargs: dict, args: dict) -> dict:
    """Takes two dictionaries, ARGS and DEFAULTARGS, on the assumption that these are
    argument dictionaries for the **kwargs call syntax. Returns a new dictionary that
    consists of the elements of ARGS, plus those elements of DEFAULTARGS whose key
    names do not appear in ARGS. That is, this function merges the contents of
    DEFAULTARGS into ARGS, except for those keys that already exist in ARGS; these
    keys keep the value they initially had in ARGS. DEFAULTARGS is only used to
    supply missing keys.
    """
    ret = defaultargs.copy()
    ret.update(args)
    return ret
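# A quick illustration of the merge semantics described above (values in ARGS win):
#     apply_defaults(defaultargs={'markov_length': 1, 'character_tokens': False},
#                    args={'markov_length': 3})
#         -> {'markov_length': 3, 'character_tokens': False}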
def fix_caps(word: str) -> str:
    """This is Harry Schwartz's token comparison function, allowing words (other than
    "I") to be compared regardless of capitalization. I don't tend to use it, but
    if you want to, set the comparison_form attribute to point to it: something like

        genny.comparison_form = fix_caps

    should work. Note that this function is NEVER called BY DEFAULT; it's a utility
    function that's left in place in case anyone else ever wants to use it.
    """
    if word.isupper() and word != "I":      # I suspect this doesn't work the way Schwartz thinks it does, but haven't tested it.
        word = word.lower()                 # isupper() looks at whether the WHOLE STRING IS CAPITALIZED, not whether it HAS CAPS IN IT.
        # Ex: "LaTeX" => "Latex"            # So this example doesn't actually describe what's going on.
    elif word[0].isupper():
        word = th.capitalize(word.lower())  # I keep meaning to report this as a bug. #FIXME
    else:
        word = word.lower()
        # Ex: "wOOt" -> "woot"
    return word
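# Hedged examples of what fix_caps() actually does (assuming th.capitalize() upcases
# only the first letter, as its use elsewhere in this module suggests):
#     fix_caps("NASA")  -> "nasa"    # all-caps words other than "I" are lowercased
#     fix_caps("LaTeX") -> "Latex"   # words with a capital first letter are re-capitalized from lowercase
#     fix_caps("wOOt")  -> "woot"    # everything else is simply lowercased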
class MarkovChainTextModel(object):
    """Chains representing a model of a text."""
    def __init__(self):
        """Instantiate a new, empty set of chains."""
        self.starts = None              # List of tokens allowed at the beginning of a sentence.
        self.markov_length = 0          # Length of the chains.
        self.mapping = None             # Dictionary representing the Markov chains.
        self.character_tokens = False   # True if the chains are characters, False if they are words.
        self.finalized = False

    def store_chains(self, filename: typing.Union[str, Path]):
        """Shove the relevant chain-based data into a dictionary, then pickle it and
        store it in the designated file.
        """
        chains_dictionary = { 'the_starts': self.starts,
                              'markov_length': self.markov_length,
                              'the_mapping': self.mapping,
                              'character_tokens': self.character_tokens }
        try:
            with open(filename, 'wb') as the_chains_file:
                the_pickler = pickle.Pickler(the_chains_file, protocol=-1)  # Use the most efficient protocol possible
                the_pickler.dump(chains_dictionary)
        except IOError as e:
            log_it("ERROR: Can't write chains to %s; the system said '%s'." % (filename, str(e)), 0)
        except pickle.PickleError as e:
            log_it("ERROR: Can't write chains to %s because a pickling error occurred; the system said '%s'." % (filename, str(e)), 0)
    def read_chains(self, filename: typing.Union[str, Path]):
        """Read the pickled chain-based data from FILENAME."""
        default_chains = { 'character_tokens': False,  # We need only assign defaults for keys added in v2.0 and later.
                         }                             # the_starts, the_mapping, and markov_length have been around since 1.0.
        try:
            with open(filename, 'rb') as the_chains_file:
                chains_dictionary = pickle.load(the_chains_file)
        except IOError as e:
            log_it("ERROR: Can't read chains from %s; the system said '%s'." % (filename, str(e)), 0)
            raise       # Re-raise: with no loaded dictionary there is nothing sensible to fall back on.
        except pickle.PickleError as e:
            log_it("ERROR: Can't read chains from %s because a pickling error occurred; the system said '%s'." % (filename, str(e)), 0)
            raise
        chains_dictionary = apply_defaults(defaultargs=default_chains, args=chains_dictionary)
        self.markov_length = chains_dictionary['markov_length']
        self.starts = chains_dictionary['the_starts']
        self.mapping = chains_dictionary['the_mapping']
        self.character_tokens = chains_dictionary['character_tokens']
        self.finalized = True
class TextGenerator(object):
    """A general-purpose text generator. To use it, instantiate it, train it, and
    then have it generate text.
    """
    def __init__(self, name: typing.Optional[str]=None, training_texts: typing.Optional[list]=None, **kwargs):
        """Create a new instance. NAME is entirely optional, and is mentioned for
        convenience (if it exists) any time a string representation is generated.
        If TRAINING_TEXTS is not None, it should be a *list* of one or more
        filenames on which the generator will be immediately trained. If you want
        to specify parameters to train() other than just a list of files (e.g., if
        you want to pass a markov_length parameter so that the chains have a length
        greater than one), you can pass them as keyword arguments here, at the end
        of the parameter list; anything not collected by the keyword arguments
        explicitly specified in this function's definition will be passed on to
        train(). (Or, you can instead call train() separately after object
        creation, if you wish.)
        """
        self.name = name                                # NAME is totally optional and entirely for your benefit.
        self.chains = MarkovChainTextModel()            # Markov chain-based representation of the text(s) used to train this generator.
        self.allow_single_character_sentences = False   # Is this model allowed to produce one-character sentences?
        # This next is the default list of substitutions that happen after text is produced.
        # List of lists. Each sublist: [search_regex, replace_regex]. Subs performed in order specified.
        self.final_substitutions = [
            ['--', '—'],
            [r'\.\.\.', '…'],
            ['․', '.'],                         # replace one-dot leader with period
            [r'\.\.', '.'],
            [" ' ", ''],
            ['――', '―'],                        # Two horizontal bars to one horizontal bar
            ['―-', '―'],                        # Horizontal bar-hyphen to single horizontal bar
            [':—', ': '],
            ["\n' ", '\n'],                     # newline--single quote--space
            ["<p>'", '<p>'],
            ["<p> ", '<p>'],                    # <p>-space to <p> (without space)
            ["<p></p>", ''],                    # <p></p> to nothing
            ['- ', '-'],                        # hyphen-space to hyphen
            ['—-', '—'],                        # em dash-hyphen to em dash
            ['——', '—'],                        # two em dashes to one em dash
            [r'([0-9]),\s([0-9])', r'\1,\2'],   # Remove spaces after commas when commas are between numbers.
            [r'([0-9]):\s([0-9])', r'\1:\2'],   # Remove spaces after colons when colons are between numbers.
            ['…—', '… —'],                      # put space in between ellipsis-em dash, if they occur together.
        ]
        if training_texts:
            self.train(training_texts, **kwargs)
    def __str__(self):
        if self.is_trained():
            if self.name:
                return '< class %s, named "%s", with Markov length %d >' % (self.__class__, self.name, self.chains.markov_length)
            else:
                return '< class %s (unnamed instance), with Markov length %d >' % (self.__class__, self.chains.markov_length)
        else:
            if self.name:
                return '< class %s, named "%s", UNTRAINED >' % (self.__class__, self.name)
            else:
                return '< class %s (unnamed instance), UNTRAINED >' % self.__class__
    @staticmethod
    def comparison_form(word: str) -> str:
        """This function is called to normalize words for the purpose of storing
        them in the list of Markov chains, and for looking at previous words when
        deciding what the next word in the sequence should be. By default, this
        function performs no processing at all; override it if any preprocessing
        should be done for comparison purposes -- for instance, if case needs to be
        normalized.
        """
        return word
    def add_final_substitution(self, substitution: typing.Union[list, tuple], position: int=-1):
        """Add another substitution to the list of substitutions performed after text is
        generated. Since the final substitutions are performed in the order they're
        listed, position matters; the POSITION parameter indicates at what position in
        the list the new substitution will appear. If POSITION is -1 (the default), the
        new substitution appears at the end of the list.
        """
        assert isinstance(substitution, (list, tuple)), "ERROR: the substitution you pass in must be a list or tuple."
        assert len(substitution) == 2, "ERROR: the substitution you pass in must be two items long."
        if position == -1: position = len(self.final_substitutions)
        self.final_substitutions.insert(position, substitution)
    def remove_final_substitution(self, substitution: typing.Union[list, tuple]):
        """Remove SUBSTITUTION from the list of final substitutions performed after text
        is generated. You must pass in *exactly* the substitution you want to remove.
        If you try to remove something that's not there, this routine will let the error
        raised by the list (which is always[?] ValueError) propagate -- trap it if you
        need to.
        """
        assert isinstance(substitution, (list, tuple)), "ERROR: the substitution you pass in must be a list or tuple."
        assert len(substitution) == 2, "ERROR: the substitution you pass in must be two items long."
        self.final_substitutions.remove(substitution)
    def get_final_substitutions(self) -> typing.List[str]:     #FIXME: check annotation
        """Returns the list of final substitutions that are performed by the text generator
        before returning the text. Just a quick index into a variable in the object
        namespace.
        """
        return self.final_substitutions

    def set_final_substitutions(self, substitutions: typing.List[str]):    # FIXME: check annotation
        """Set the list of final substitutions that are performed on generated text before
        it's returned. SUBSTITUTIONS must be a list of two-item lists, of the form
        [regex to search for, replacement], as in the default list in the __init__()
        method for the class.
        """
        for sublist in substitutions:       # Do some basic error-checking
            assert isinstance(sublist, (list, tuple)), "ERROR: substitution %s is not a list or tuple." % sublist
            assert len(sublist) == 2, "ERROR: substitution %s is not two items long." % sublist
        self.final_substitutions = substitutions
    def addItemToTempMapping(self, history: typing.List[str],       #FIXME: check annotations
                             word: str,
                             weight: typing.Union[float, int]=1.0) -> None:
        """Self-explanatory -- adds "word" to the "the_temp_mapping" dict under "history".
        the_temp_mapping (and the_mapping) both match each word to a list of possible next
        words.

        WEIGHT is a real number (by default, 1.0) indicating how much 'weight' to add to
        this mapping in the temporary mapping. Items that are weighted more heavily will
        of course be more likely to pop back out.

        Given history = ["the", "rain", "in"] and word = "Spain", we add "Spain" to
        the entries for ["the", "rain", "in"], ["rain", "in"], and ["in"].
        """
        while len(history) > 0:
            first = tuple(history)
            if first in self.the_temp_mapping:
                if word in self.the_temp_mapping[first]:
                    self.the_temp_mapping[first][word] += weight
                else:
                    self.the_temp_mapping[first][word] = weight
            else:
                self.the_temp_mapping[first] = dict()
                self.the_temp_mapping[first][word] = weight
            history = history[1:]
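    # Worked example of the docstring above (assuming the_temp_mapping starts out empty):
    # after addItemToTempMapping(["the", "rain", "in"], "Spain"), the_temp_mapping holds
    #     {('the', 'rain', 'in'): {'Spain': 1.0},
    #      ('rain', 'in'):        {'Spain': 1.0},
    #      ('in',):               {'Spain': 1.0}}
    # i.e. one entry per progressively shorter history, each crediting "Spain" with the given weight.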
    def next(self, prevList: typing.List,       #FIXME: check annotations
             the_mapping: typing.Dict) -> str:
        """Returns the next word in the sentence (chosen randomly),
        given the previous ones.
        """
        prevList = [ self.comparison_form(p) for p in prevList ]   # Use the canonical comparison form
        total = 0.0
        ret = ""
        index = random.random()
        # Shorten prevList until it's in the_mapping
        try:
            while tuple(prevList) not in the_mapping:
                prevList.pop(0)     # Just drop the earliest list element & try again if the list isn't in the_mapping
        except IndexError:          # If we somehow wind up with an empty list (shouldn't happen), then just end the sentence;
            ret = "."               # this will force the generator to start a new one.
        else:                       # Otherwise, get a random word from the_mapping, given prevList, if prevList isn't empty
            for k, v in the_mapping[tuple(prevList)].items():
                total += v
                if total >= index and ret == "":
                    ret = k
                    break
        return ret
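    # Sketch of the weighted draw above: the_mapping values are normalized frequencies, so if
    # the_mapping[('in',)] were {'Spain': 0.75, 'Portugal': 0.25} (hypothetical numbers), then
    # next(['in'], the_mapping) would return 'Spain' roughly three times out of four, because the
    # running total crosses the random threshold in proportion to each candidate's weight.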
    def _build_mapping(self, token_list: typing.List[str],     #FIXME: check annotations
                       markov_length: int,
                       character_tokens: bool=False,
                       learn_starts: bool=True,
                       weight: typing.Union[float, int]=1.0) -> None:
        """Add the data in TOKEN_LIST to the temporary mapping data that is being built
        as the model is trained. If CHARACTER_TOKENS is True, sets the corresponding
        flag on the chains (and the chains that are passed in should also be letters,
        not words: this is not checked, but getting it wrong may result in weird
        behavior later on). MARKOV_LENGTH is of course the length of the Markov chains
        being generated. If LEARN_STARTS is True (the default), the first words of
        sentences in TOKEN_LIST are also added to the .starts attribute of the chains.
        This is normally desirableable, and there has to be SOMETHING in .starts for a
        set of chains to be able to produce text; but there may be texts (such as
        collections of partial sentences) that we want to use to pad the existing
        chains without teaching the generator new sentence openings; setting
        LEARN_STARTS to False for those texts avoids producing weird uncapitalized,
        grammatically incomplete sentence beginnings as a result. WEIGHT is the
        relative weighting to give to these tokens in the mapping; this will be
        normalized later when _finalize_mapping() is called.

        This function does not finalize the mappings by normalizing the frequency
        counts; _finalize_mapping() needs to be called for that.
        """
        try:
            _ = self.the_temp_mapping
        except AttributeError:
            self.the_temp_mapping = dict()
        if (not hasattr(self.chains, 'starts')) or not (self.chains.starts):
            self.chains.starts = list()
        self.chains.markov_length = markov_length
        self.chains.character_tokens = character_tokens
        if token_list[0] not in self.chains.starts:
            self.chains.starts.append(token_list[0])
        for i in range(1, len(token_list) - 1):
            if i <= markov_length:
                history = token_list[: i + 1]
            else:
                history = token_list[i - markov_length + 1 : i + 1]
            follow = token_list[i + 1]
            # if the last elt was a sentence-ending punctuation, add the next word to the start list
            if learn_starts:
                if history[-1] in sentence_ending_punct and follow not in punct_with_space_after:
                    if follow not in self.chains.starts:
                        self.chains.starts.append(follow)
            self.addItemToTempMapping(history, follow, weight=weight)
    def _finalize_mapping(self):
        """Finalize the mapping in SELF by normalizing probability frequencies of
        occurrences. This must be done once, after the model has been trained on
        all texts, but not more than once. The higher-level train() method calls it and
        is a good choice if we're passing in one set of texts that has one set of
        parameters that get set all at once. For fiddlier training processes that don't
        happen atomically, it should be called manually once everything is done.
        """
        # First, check the invariants.
        assert hasattr(self, 'the_temp_mapping'), "ERROR! The text generator has not been trained!"
        assert self.the_temp_mapping, "ERROR! Training for the text generator has not begun!"
        assert hasattr(self.chains, 'starts'), "ERROR! The text generator's training did not result in any sentence beginnings!"
        assert self.chains.starts, "ERROR! The text generator's training did not result in any sentence beginnings!"
        # Next, restrict the possible range of STARTS if we're using single-character chains.
        if self.chains.character_tokens:
            self.chains.starts = [c for c in self.chains.starts if c.isupper()]
        # Then, normalize the frequencies and install the new dictionary in the object's mappings.
        the_mapping = dict()
        for first, followset in self.the_temp_mapping.items():
            total = sum(followset.values())
            the_mapping[first] = dict([(k, v / total) for k, v in followset.items()])   # Here's the normalizing step.
        self.chains.mapping = the_mapping
        # Clean up and mark finalized.
        del self.the_temp_mapping
        self.chains.finalized = True
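    # For instance, a raw follow-set of {'Spain': 3.0, 'Portugal': 1.0} in the_temp_mapping
    # becomes {'Spain': 0.75, 'Portugal': 0.25} in the finalized chains.mapping.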
    @staticmethod
    def _tokenize_string(the_string: str) -> typing.List[str]:
        """Split a string into tokens, which more or less correspond to words. More aware
        than a naive str.split() because it takes punctuation into account to some
        extent.
        """
        return re.findall(r"[\w%s]+|[%s]" % (word_punct, token_punct), the_string)
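    # Rough illustration of the tokenization: word characters (plus the word_punct marks) clump
    # together, while token_punct marks come out as tokens of their own, e.g.
    #     TextGenerator._tokenize_string("Hello, world!")  ->  ['Hello', ',', 'world', '!']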
    def _token_list(self, the_string: str,
                    character_tokens: bool=False) -> typing.List[str]:
        """Convert a string into a list of tokens that the text generator can process,
        and therefore be trained by.
        """
        if character_tokens:
            tokens = list(the_string)
        else:
            tokens = self._tokenize_string(the_string)
        return [self.comparison_form(w) for w in tokens]
    def is_trained(self) -> bool:
        """Detect whether this model is trained or not."""
        return all([self.chains.finalized, self.chains.starts, self.chains.mapping, self.chains.markov_length])
    def _train_from_text(self, the_text: str,
                         markov_length: int=1,
                         character_tokens: bool=False,
                         weight: typing.Union[float, int]=1.0,
                         learn_starts: bool=True) -> None:
        """Train the model by getting it to analyze a text passed in. Note that THE_TEXT is
        a single string here. MARKOV_LENGTH is, of course, the length of the Markov
        chains to generate; CHARACTER_TOKENS indicates whether tokens are single
        characters (if it is True) or whole words (if it is False). WEIGHT is the
        relative numerical weighting to give to this piece of text. LEARN_STARTS
        toggles whether the beginnings of sentences are added to the generator's
        .starts list, which is generally desirable but needs to be turned off in some
        situations.
        """
        assert the_text, "ERROR! blank text was passed to _train_from_text()!"
        self._build_mapping(self._token_list(the_text, character_tokens=character_tokens),
                            markov_length=markov_length, character_tokens=character_tokens,
                            weight=weight, learn_starts=learn_starts)
    def train(self, the_files: typing.Union[str, bytes, Path, typing.List[typing.Union[str, bytes, Path]]],
              markov_length: int=1,
              character_tokens: bool=False) -> None:
        """Train the model from a text file, or a list of text files, supplied as THE_FILES.
        This routine is the easiest way to train a generator all at once on a single
        file or set of files that all have the same training parameters. Fiddlier
        training processes will need to call _train_from_text() manually at least once,
        then _finalize_mapping() when all of the mappings have been created.
        """
        if isinstance(the_files, (str, bytes, Path)):
            the_files = [ the_files ]
        assert isinstance(the_files, (list, tuple)), "ERROR: you cannot pass an object of type %s to %s.train" % (type(the_files), self)
        assert len(the_files) > 0, "ERROR: empty file list passed to %s.train()" % self
        the_text = ""
        for which_file in the_files:
            with open(which_file) as the_file:
                the_text = the_text + '\n' + the_file.read()
        self._train_from_text(the_text=the_text, markov_length=markov_length, character_tokens=character_tokens)
        self._finalize_mapping()
    def _gen_sentence(self) -> str:
        """Build a sentence, starting with a random 'starting word.' Returns a string,
        which is the generated sentence.
        """
        assert self.is_trained(), "ERROR: the model %s needs to be trained before it can generate text!" % self
        log_it(" _gen_sentence() called.", 4)
        log_it(" markov_length = %d." % self.chains.markov_length, 5)
        log_it(" the_mapping = %s." % self.chains.mapping, 5)
        log_it(" starts = %s." % self.chains.starts, 5)
        log_it(" allow_single_character_sentences = %s." % self.allow_single_character_sentences, 5)
        curr = random.choice(self.chains.starts)
        sent = curr
        prevList = [curr]
        # Keep adding words until we hit a period, exclamation point, or question mark
        while curr not in sentence_ending_punct:
            curr = self.next(prevList, self.chains.mapping)
            prevList.append(curr)
            # if the prevList has gotten too long, trim it
            while len(prevList) > self.chains.markov_length:
                prevList.pop(0)
            if not self.chains.character_tokens:    # Don't add spaces between tokens that are just single characters.
                if curr not in punct_with_no_space_before:
                    if (len(prevList) < 2 or prevList[-2] not in punct_with_no_space_after):
                        sent += " "                 # Add spaces between words (but not punctuation)
            sent += curr
        if not self.allow_single_character_sentences:
            if len(sent.strip().strip(sentence_ending_punct).strip()) == 1:
                if sent.strip().strip(sentence_ending_punct).strip().upper() != "I":
                    sent = self._gen_sentence()     # Retry, recursively.
        return th.capitalize(sent)
    def _produce_text(self, sentences_desired: int=1,
                      paragraph_break_probability: float=0.25) -> typing.Generator[str, None, None]:
        """Actually generate some text. This is a generator function that produces (yields)
        one paragraph at a time. If you just need all the text at once, you might want
        to use the convenience wrapper gen_text() instead.
        """
        log_it("_produce_text() called.", 4)
        log_it(" Markov length is %d; requesting %d sentences." % (self.chains.markov_length, sentences_desired), 4)
        log_it(" Legitimate starts: %s" % self.chains.starts, 5)
        log_it(" Probability data: %s" % self.chains.mapping, 5)
        the_text = ""
        for which_sentence in range(0, sentences_desired):
            try:
                if the_text[-1] != "\n":            # If we're not starting a new paragraph ...
                    the_text = the_text + " "       # ... add a space after the sentence-ending punctuation.
            except IndexError:                      # If this is the very beginning of our generated text ...
                pass                                # ... well, we don't need to add a space to the beginning of the text, then.
            the_text = the_text + self._gen_sentence()
            if random.random() <= paragraph_break_probability or which_sentence == sentences_desired - 1:
                the_text = th.multi_replace(the_text, self.final_substitutions)
                try:
                    yield the_text.strip() + "\n"
                except RuntimeError:                # Conforms to Python 3.7 changes in behavior. Sigh.
                    return
                the_text = ""
    def gen_text(self, sentences_desired: int=1,
                 paragraph_break_probability: float=0.25) -> str:
        """Generate the full amount of text required. This is just a convenience wrapper
        for _produce_text().
        """
        return '\n'.join(self._produce_text(sentences_desired, paragraph_break_probability))
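    # Minimal usage sketch for the public interface above (the corpus path is hypothetical):
    #     gen = TextGenerator(name="sample generator")
    #     gen.train(['corpus.txt'], markov_length=2)
    #     print(gen.gen_text(sentences_desired=8, paragraph_break_probability=0.2))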
    def gen_html_frag(self, sentences_desired: int=1,
                      paragraph_break_probability: float=0.25):
        """Produce the same text that _produce_text() would, but wrapped in HTML <p></p> tags."""
        log_it("We're generating an HTML fragment.", 3)
        the_text = self._produce_text(sentences_desired, paragraph_break_probability)
        return '\n\n'.join(['<p>%s</p>' % p.strip() for p in the_text])
    def _printer(self, what: str,
                 columns: int=-1):
        """Print WHAT in an appropriate way, wrapping to the specified number of
        COLUMNS. If COLUMNS is -1, take a whack at guessing what it should be. If
        COLUMNS is zero, do no wrapping at all.
        """
        if columns == 0:        # Wrapping is totally disabled. Print exactly as generated.
            log_it("INFO: COLUMNS is zero; not wrapping text at all", 3)
            print(what)
        else:
            if columns == -1:   # Wrap to best guess for terminal width
                log_it("INFO: COLUMNS is -1; wrapping text to best-guess column width", 3)
                padding = 0
            else:               # Wrap to specified width (unless current terminal width is odd, in which case we're off by 1/2. Oh well.)
                padding = max((th.terminal_width() - columns) // 2, 0)
                log_it("INFO: COLUMNS is %s; padding text with %s spaces on each side" % (columns, padding), 3)
            log_it("NOTE: terminal width is %s" % th.terminal_width(), 3)
            what = th.multi_replace(what, [['\n\n', '\n'], ])   # Last chance to postprocess text is right here
            for the_paragraph in what.split('\n'):
                if the_paragraph:                               # Skip any empty paragraphs that may pop up
                    th.print_indented(the_paragraph, each_side=padding)
                    print()
    def print_text(self, sentences_desired: int,
                   paragraph_break_probability: float=0.25,
                   pause: float=0,
                   columns: int=-1):
        """Prints generated text directly to stdout."""
        for t in self._produce_text(sentences_desired, paragraph_break_probability):
            time_now = time.time()
            self._printer(t, columns=columns)
            time.sleep(max(pause - (time.time() - time_now), 0))    # Pause until it's time for a new paragraph.
if __name__ == "__main__":
    gen = TextGenerator()
    gen.train(['/lovecraft/corpora/previous/The Alchemist.txt'])
    gen.print_text(25)