Random sentence from book: Difference between revisions

Content added Content deleted

Inline

Revision as of 19:29, 15 February 2021

Read in the book "The War of the Worlds", by H. G. Wells.
Skip to the start of the book, proper.
Remove extraneous punctuation, but keep at least sentence-ending punctuation characters . ! and ?
Keep account of what words follow words and how many times it is seen, (treat sentence terminators as words too).
Keep account of what words follow two words and how many times it is seen, (again treating sentence terminators as words too).
Assume that a sentence starts with a not to be shown full-stop character then use a weighted random choice of the possible words that may follow a full-stop to add to the sentence.
Then repeatedly add words to the sentence based on weighted random choices of what words may follow the last two words to extend the sentence.
Stop when the sentence ends in a sentence ending punctuation character.
Tidy and then print the sentence.

Show examples of random sentences generated.

ALGOL 68

Works with: ALGOL 68G version Any - tested with release 2.8.3.win32

<lang algol68># generate random sentences using text from a book as a basis #

# use the associative array in the Associative array/Iteration task #

PR read "aArray.a68" PR

# returns a copy of s without any of the characters that occur in chars #

PRIO REMOVE = 1;
OP   REMOVE = ( STRING s, chars )STRING:
     BEGIN
        INT lo = LWB s;
        # the kept characters are packed at the start of this buffer    #
        [ lo : UPB s ]CHAR kept;
        INT n kept := 0;
        FOR i FROM lo TO UPB s DO
            CHAR ch = s[ i ];
            IF char in string( ch, NIL, chars ) THEN
                # ch is one of the characters to drop                   #
                SKIP
            ELSE
                kept[ lo + n kept ] := ch;
                n kept +:= 1
            FI
        OD;
        # only the filled part of the buffer is the result              #
        kept[ lo : lo + n kept - 1 ]
     END # REMOVE # ;

# returns text converted to a non-negative INT, or -1 if text is not a number: #
# i.e. it is empty, contains a non-digit or is too large to fit in an INT      #

OP TOINT = ( STRING text )INT:
    IF UPB text < LWB text THEN
        # an empty string is not a number                            #
        -1
    ELSE
        INT  result     := 0;
        BOOL is numeric := TRUE;
        FOR ch pos FROM LWB text TO UPB text WHILE is numeric DO
            CHAR c = text[ ch pos ];
            is numeric := ( c >= "0" AND c <= "9" );
            IF is numeric THEN
                INT digit = ABS c - ABS "0";
                IF result > ( max int - digit ) OVER 10 THEN
                    # adding this digit would overflow an INT        #
                    is numeric := FALSE
                ELSE
                    result := ( result * 10 ) + digit
                FI
            FI
        OD;
        IF is numeric THEN result ELSE -1 FI
    FI # TOINT # ;

# get the file name, the number of words for the prefix, the start word #
# and the max number of words and sentences from the command line       #
# options are KEYWORD value pairs, the keywords are case-insensitive    #

STRING file name           := "twotw.txt";
STRING start word          := "";
INT    prefix length       := 2;
INT    number of sentences := 10;
INT    max words           := 1 000 000;
BEGIN
    INT arg pos := 1;
    WHILE arg pos < argc DO
        # upper-case the argument so the keyword test ignores case      #
        STRING arg upper := argv( arg pos );
        FOR ch pos FROM LWB arg upper TO UPB arg upper DO
            IF is lower( arg upper[ ch pos ] ) THEN arg upper[ ch pos ] := to upper( arg upper[ ch pos ] ) FI
        OD;
        STRING arg value   = argv( arg pos + 1 );
        BOOL   is keyword := TRUE;
        IF   arg upper  = "FILE"      THEN
            file name           :=       arg value
        ELIF arg upper  = "PREFIX"    THEN
            prefix length       := TOINT arg value
        ELIF arg upper  = "SENTENCES" THEN
            number of sentences := TOINT arg value
        ELIF arg upper  = "MAXWORDS"  THEN
            max words           := TOINT arg value
        ELIF arg upper  = "STARTWORD" THEN
            start word          :=       arg value
        ELSE
            is keyword          := FALSE
        FI;
        # step over the value of a recognised option so that it is not  #
        # itself mistaken for a keyword, e.g. a start word of "prefix"  #
        arg pos +:= IF is keyword THEN 2 ELSE 1 FI
    OD
END;

# delimiter for separating suffixes - must not appear in the text #
CHAR   suffix delimiter = REPR 1; # ^A #
# characters that are stripped from prefixes by the REMOVE operator #
STRING punctuation      = """'@,/;:(){}[]*&^%$£";

# main program: read the words of the book, record which words follow  #
# each prefix and then print the requested number of random sentences  #
IF FILE input file;
   open( input file, file name, stand in channel ) /= 0
THEN
   # failed to open the file #
   print( ( "Unable to open """ + file name + """", newline ) )
ELSE
   # file opened OK #
   BOOL at eof := FALSE;
   BOOL at eol := FALSE;
   # set the EOF handler for the file #
   on logical file end( input file
                      , ( REF FILE f )BOOL:
                        BEGIN
                            # note that we reached EOF on the #
                            # latest read #
                            at eof := TRUE;
                            # return TRUE so processing can continue #
                            TRUE
                        END
                      );
   # set the end-of-line handler for the file so get word can see line boundaries #
   on line end( input file
              , ( REF FILE f )BOOL:
                BEGIN
                    # note we reached end-of-line #
                    at eol := TRUE;
                    # return FALSE to use the default eol handling  #
                    # i.e. just get the next character              #
                    FALSE
                END
              );
   # the most recently read character, shared between get word calls #
   CHAR   c    := " ";
   # returns the next word from input file                          #
   # a word is any sequence of characters separated by spaces and   #
   # suffix delimiters, or one of the characters ".", "!" or "?"    #
   # returns "" once the end of the file has been reached           #
   PROC get word = STRING:
        IF at eof THEN ""
        ELSE # not at end of file                                   #
           STRING word := "";
           at eol := FALSE;
           IF c = "." OR c = "!" OR c = "?" THEN
               # sentence ending "word"                             #
               word := c;
               get( input file, ( c ) )
           ELSE
               # "normal" word                                      #
               WHILE ( c = " " OR c = suffix delimiter ) AND NOT at eof DO get( input file, ( c ) ) OD;
               WHILE c /= " "
                 AND c /= "."
                 AND c /= "!"
                 AND c /= "?"
                 AND c /= suffix delimiter
                 AND NOT at eol
                 AND NOT at eof
               DO
                   word +:= c;
                   get( input file, ( c ) )
               OD
           FI;
           at eol := FALSE;
           word
        FI # get word # ;

   # returns a random number between 1 and n inclusive              #
   # for n < 2, n itself is returned                                #
   PROC random choice = ( INT n )INT: IF n < 2 THEN n ELSE ENTIER ( ( next random * n ) + 1 ) FI;

   # chooses a suffix at random to continue a sentence              #
   # sfxs must be a list of suffixes each FOLLOWED by a suffix      #
   # delimiter - a suffix is only recognised when the delimiter     #
   # after it is found, so a leading delimiter yields an empty      #
   # suffix and text after the final delimiter is never chosen      #
   PROC choose suffix = ( STRING sfxs )STRING:
        BEGIN
           # count the number of suffixes                           #
           INT suffix max := 0;
           FOR s pos FROM LWB sfxs TO UPB sfxs DO
              IF sfxs[ s pos ] = suffix delimiter THEN suffix max +:= 1 FI
           OD;
           # select a random suffix to continue the text with       #
           STRING sfx          := "";
           INT    prev pos     := LWB sfxs - 1;
           INT    suffix count := random choice( suffix max );
           FOR s pos FROM LWB sfxs TO UPB sfxs WHILE suffix count > 0  DO
               IF sfxs[ s pos ] = suffix delimiter THEN
                   # found the end of a suffix                      #
                   sfx           := sfxs[ prev pos + 1 : s pos - 1 @ 1 ];
                   prev pos      := s pos;
                   suffix count -:= 1
               FI
           OD;
           sfx
        END # choose suffix # ;

   # skip to the start word, if there is one                        #
   IF start word /= "" THEN WHILE NOT at eof AND get word /= start word DO SKIP OD FI;
   # get the first prefix from the file                             #
   [ prefix length ]STRING prefix;
   FOR p pos TO prefix length WHILE NOT at eof DO prefix[ p pos ] := get word OD;
   IF at eof THEN
       # not enough words in the file                               #
       print( ( file name, " contains less than ", whole( prefix length, 0 ), " words", newline ) )
   ELSE
       # have some words                                            #
       INT word count := prefix length;
       # store the prefixes and suffixes in the associative array   #
       # we store the suffix as a single concatenated               #
       # string delimited by suffix delimiters, the string will     #
       # have a leading delimiter                                   #
       # suffixes that appear multiple times in the input text will #
       # appear multiple times in the array, this will allow them   #
       # to have a higher probability than suffixes that appear     #
       # fewer times                                                #
       # this will use more memory than storing the suffixes and a  #
       # count, but simplifies the generation                       #
       # with a prefix length of 2 (as required by the task),       #
       # the War Of The Worlds can be processed - for longer prefix #
       # lengths a less memory hungry algorithm would be needed     #
       # the prefixes that start a sentence are additionally stored #
       # as the "suffixes" of the special key "*."                  #
       REF AARRAY suffixes := INIT LOC AARRAY;
       INT prefix count    := 0;
       WHILE NOT at eof AND word count <= max words
       DO
           # concatenate the prefix words to a single string        #
           STRING prefix text := prefix[ 1 ];
           FOR p pos FROM 2 TO prefix length DO prefix text +:= ( " " + prefix[ p pos ] ) OD;
           STRING suffix := get word;
           # if the prefix has no lower case, ignore it as it is    #
           # probably a chapter heading or similar                  #
           IF BOOL has lowercase := FALSE;
              FOR s pos FROM LWB prefix text TO UPB prefix text
              WHILE NOT ( has lowercase := is lower( prefix text[ s pos ] ) )
              DO SKIP OD;
              has lowercase
           THEN
               # the prefix contains some lower case                #
               # store the suffixes associated with the prefix      #
               IF NOT ( suffixes CONTAINSKEY prefix text ) THEN
                   # first time this prefix has appeared            #
                   prefix count +:= 1
               FI;
               IF prefix[ 1 ] = "." OR prefix[ 1 ] = "!" OR prefix[ 1 ] = "?" THEN
                   # have the start of a sentence                   #
                   suffixes // "*." +:= ( suffix delimiter + prefix text )
               FI;
               # the lookup key is the prefix without punctuation,  #
               # unless that would leave nothing                    #
               STRING prefix without punctuation = prefix text REMOVE punctuation;
               IF prefix without punctuation /= "" THEN prefix text := prefix without punctuation FI;
               suffixes // prefix text +:= ( suffix delimiter + suffix )
           FI;
           # shuffle the prefixes down one and add the new suffix   #
           # as the final prefix                                    #
           FOR p pos FROM 2 TO prefix length DO prefix[ p pos - 1 ] := prefix[ p pos ] OD;
           prefix[ prefix length ] := suffix;
           IF NOT at eof THEN word count +:= 1 FI
       OD;

       # generate text                                                  #
       # the sentence starting prefixes are stored with a leading       #
       # delimiter but choose suffix needs a trailing one, so move the  #
       # delimiter to the end - passing the stored form directly would  #
       # sometimes select an empty prefix and never select the last one #
       STRING start prefixes := ( suffixes // "*." );
       IF LWB start prefixes <= UPB start prefixes THEN
           IF start prefixes[ LWB start prefixes ] = suffix delimiter THEN
               start prefixes := start prefixes[ LWB start prefixes + 1 : ]
           FI
       FI;
       start prefixes +:= suffix delimiter;
       IF start prefixes = suffix delimiter THEN
           # no sentence start was recorded - nothing can be generated  #
           print( ( "No sentence starts found in ", file name, newline ) )
       ELSE
           TO number of sentences DO
               print( ( newline ) );
               # start with a random prefix                                 #
               STRING pfx      := choose suffix( start prefixes );
               STRING line     := pfx[ @ 1 ][ 3 : ]; # remove the leading   #
                                                     #   ". " from the line #
               pfx             := pfx REMOVE punctuation;
               BOOL   finished := FALSE;
               WHILE NOT finished DO
                   IF STRING sfxs := ( suffixes // pfx );
                      # convert leading-delimiter form to trailing form     #
                      IF LWB sfxs <= UPB sfxs THEN
                          IF sfxs[ LWB sfxs ] = suffix delimiter THEN sfxs := sfxs[ LWB sfxs + 1 : ] FI
                      FI;
                      sfxs +:= suffix delimiter;
                      sfxs = suffix delimiter
                   THEN
                       # no suffix - reached the end of the generated text  #
                       line +:= " (" + pfx + " has no suffix)";
                       finished := TRUE
                   ELSE
                       # can continue to generate text                      #
                       STRING sfx = choose suffix( sfxs );
                       IF sfx = "." OR sfx = "!" OR sfx = "?"
                       THEN
                           # reached the end of a sentence                  #
                           finished := TRUE;
                           # if the line ends with ",;:", remove it         #
                           INT    line end := UPB line;
                           IF CHAR c = line[ line end ]; c = "," OR c = ";" OR c = ":" THEN
                               line end -:= 1
                           FI;
                           # remove trailing spaces                         #
                           WHILE line[ line end ] = " " AND line end > LWB line DO line end -:= 1 OD;
                           line := line[ LWB line : line end ] + sfx
                       ELSE
                           # not at the end of the sentence                 #
                           line +:= " " + sfx;
                           # remove the first word from the prefix and add  #
                           # the suffix                                     #
                           IF  INT space pos := 0;
                               NOT char in string( " ", space pos, pfx )
                           THEN
                               # the prefix is only one word                #
                               pfx := sfx
                           ELSE
                               # have multiple words                        #
                               pfx := ( pfx[ space pos + 1 : ] + " " + sfx )[ @ 1 ]
                           FI;
                           STRING pfx without punctuation = pfx REMOVE punctuation;
                           IF pfx without punctuation /= "" THEN pfx := pfx without punctuation FI
                       FI
                   FI
               OD;
               print( ( line, newline ) )
           OD
       FI
   FI;
   close( input file )
FI</lang>

Output:

Sample output produced with the command-line:
a68g randomSentenceFromBook.a68 - FILE twotw.txt PREFIX 2 SENTENCES 10 STARTWORD cover MAXWORDS 60075
One of the sentences has been manually split over two lines.


The wine press of God that sometimes comes into the water mains near the Martians.

They said nothing to tell people until late in the back of this in the early dawn the curve of Primrose Hill.

At last as the day became excessively hot, and close, behind him, opened, and the South-Eastern and the morning sunlight.

"Are we far from Sunbury?

Since the night.

In one place but some mouldy cheese.

Then a dirty woman, carrying a baby, Gregg the butcher and his little boy, and two of them, stark and silent eloquent lips.

Unable from his window sash, and heads in every direction over the brim of which gripped a young pine trees, about the guns were waiting.

And this was the sense to keep up his son with a heavy explosion shook the air, of it first from my newspaper boy about a quarter of the heat,
    of the whole place was impassable.

Presently, he came hurrying after me he barked shortly.

Julia

<lang julia>"""
    weightedrandompick(dict, total)

Weighted random pick of a key from `dict`, a `Dict{String, Int}` whose keys are
words and whose values are counts. `total` is the upper bound of the random draw
and is normally the sum of the counts.

If `total` is larger than the sum of the counts the draw can run past the final
entry; the last key visited is returned in that case. Throws an `ArgumentError`
when `dict` is empty.
"""
function weightedrandompick(dict, total)
    isempty(dict) && throw(ArgumentError("cannot pick from an empty dictionary"))
    remaining = rand(1:total)
    lastkey = nothing
    for (key, count) in dict
        remaining -= count
        remaining <= 0 && return key
        # remembered for the fallback: a Dict's key set cannot be indexed with last()
        lastkey = key
    end
    return lastkey
end

let
    # Read in the book "The War of the Worlds", by H. G. Wells.
    wotw_uri = "http://www.gutenberg.org/files/36/36-0.txt"
    wfile = "war_of_the_worlds.txt"
    stat(wfile).size == 0 && download(wotw_uri, wfile)  # download if file not here already
    text = read(wfile, String)

    # Skip to the start of the book and prune the end.
    startphrase, endphrase = "No one would have believed", "she has counted me, among the dead"
    startrange, endrange = findfirst(startphrase, text), findlast(endphrase, text)
    if startrange === nothing || endrange === nothing
        error("could not find the start and end of the book text in $wfile")
    end
    text = text[first(startrange):last(endrange)]

    # Remove extraneous punctuation, but keep at least sentence-ending punctuation
    # characters . ! and ? (each becomes a separate word).
    text = replace(replace(text, r"[^01-9a-zA-Z\.\?\!’,]" => " "), r"([.?!])" => s" \1")
    words = split(text, r"\s+")
    # Lower-case the first word of each sentence, except for "I".
    for (i, w) in enumerate(words)
        w != "I" && i > 1 && words[i - 1] in [".", "?", "!"] && (words[i] = lowercase(words[i]))
    end

    # Keep account of what words follow words and how many times it is seen
    # (treating sentence terminators as words too). Keep account of what words
    # follow two words and how many times it is seen (again treating sentence
    # terminators as words too). afterstop counts the words seen after a terminator.
    follows, follows2 = Dict{String, Dict{String, Int}}(), Dict{String, Dict{String, Int}}()
    afterstop, wlen = Dict{String, Int}(), length(words)
    for (i, w) in enumerate(@view words[1:end-1])
        d = get!(follows, w, Dict(words[i + 1] => 0))
        get!(d, words[i + 1], 0)
        d[words[i + 1]] += 1
        if w in [".", "?", "!"]
            get!(afterstop, words[i + 1], 0)
            afterstop[words[i + 1]] += 1
        end
        # the last pair of words has nothing following it
        (i > wlen - 2) && continue
        w2 = w * " " * words[i + 1]
        d = get!(follows2, w2, Dict(words[i + 2] => 0))
        get!(d, words[i + 2], 0)
        d[words[i + 2]] += 1
    end
    followsums = Dict(key => sum(values(follows[key])) for key in keys(follows))
    follow2sums = Dict(key => sum(values(follows2[key])) for key in keys(follows2))
    afterstopsum = sum(values(afterstop))

    # Assume that a sentence starts with a not to be shown full-stop character
    # then use a weighted random choice of the possible words that may follow a
    # full-stop to add to the sentence. (Here we use '.', '?', or '!' for the
    # full stop character.) Then extend the sentence using the last two words
    # until a sentence terminator is added, and print it.
    function makesentence()
        firstword = weightedrandompick(afterstop, afterstopsum)
        sentencewords = [firstword, weightedrandompick(follows[firstword], followsums[firstword])]
        while !(sentencewords[end] in [".", "?", "!"])
            w2 = sentencewords[end-1] * " " * sentencewords[end]
            if haskey(follows2, w2)
                push!(sentencewords, weightedrandompick(follows2[w2], follow2sums[w2]))
            else
                # unseen word pair: continue with a word that can follow a terminator
                push!(sentencewords, weightedrandompick(afterstop, afterstopsum))
            end
        end
        # uppercasefirst is safe for empty words and multi-byte first characters
        # such as ’ where byte indexing with firstword[2:end] would throw
        sentencewords[1] = uppercasefirst(firstword)
        println(join(sentencewords[1:end-1], " ") * sentencewords[end] * "\n")
    end

    makesentence(); makesentence(); makesentence()

end

</lang>

Output:

(RUN:)

It may be lying dead there!

I can imagine them covered with smashed windows and saw the flashes of flame flashed up
and saw through a culvert.

I remember how mockingly bright the sky was still doubtful it rapped smartly against the
starlight from the sun blazed dazzling in a flash I was beginning to face these things
but later I perceived a hold on me and rapidly growing hotter.

(RUN:)

It was this cylinder.

Ogilvy watched till one, and they say there’s been guns heard at Chertsey, heavy firing,
and that every other man still wore his dirty rags.

My companion had been enlarged, and ever!

(RUN:)

Survivors on castle hill alive but helplessly and speechlessly drunk.

Before they were killed.

The landlord should leave his.

(RUN:)

And a cheer that seemed so happy and bright.

Once down one of the tangled maze of streets would have questioned my intellectual
superiority to his feet and had been in active service and he turned to see Lord Hilton,
the lord of the parapet.

What has happened?

Python

Extended to preserve some extra "sentence pausing" characters and try and tidy-up apostrophes.

<lang python>from urllib.request import urlopen
import re
from string import punctuation
from collections import Counter, defaultdict
import random

# The War of the Worlds, by H. G. Wells
text_url = 'http://www.gutenberg.org/files/36/36-0.txt'
# the book proper starts at the first occurrence of this phrase
text_start = 'No one would have believed'

# characters that end a sentence, and characters that pause within one
sentence_ending = '.!?'
sentence_pausing = ',;:'

def read_book(text_url, text_start) -> str:
    "Fetch text_url as UTF-8 text and return it from the first text_start onwards"
    with urlopen(text_url) as response:
        raw = response.read()
    book = raw.decode('utf-8')
    # str.index raises ValueError when the start phrase is absent
    offset = book.index(text_start)
    return book[offset:]

def remove_punctuation(text: str, keep=sentence_ending+sentence_pausing)-> str:
    """Remove punctuation, keeping some.

    Every character that is not an ASCII letter, a digit, a space, a newline
    or one of the characters in `keep` is replaced by a space. Runs of kept
    punctuation are reduced to a single character surrounded by spaces, so
    that it splits off as a separate word. The result is lower-cased and
    always ends with a sentence-ending character: ' .' is appended when it
    does not, including when the text is empty.
    """
    to_remove = ''.join(sorted(set(punctuation) - set(keep)))
    text = text.translate(str.maketrans(to_remove, ' ' * len(to_remove))).strip()
    # escape keep so characters such as ] ^ - or \ cannot break the character class
    kept = re.escape(keep)
    text = re.sub(fr"[^a-zA-Z0-9{kept}\n ]+", ' ', text)
    # Remove duplicates and put space around remaining punctuation
    if keep:
        text = re.sub(f"([{kept}])+", r" \1 ", text).strip()
    # `not text` guards the index below when nothing is left
    if not text or text[-1] not in sentence_ending:
        text += ' .'
    return text.lower()

def word_follows_words(txt_with_pauses_and_endings):
    "Return two dicts: follower counts after each word, and after each pair of words"
    # a leading full stop stands for the sentence break before the first word
    tokens = ['.']
    tokens.extend(txt_with_pauses_and_endings.strip().split())

    after_one = defaultdict(lambda: defaultdict(int))
    after_two = defaultdict(lambda: defaultdict(int))
    for pos in range(1, len(tokens)):
        current = tokens[pos]
        after_one[tokens[pos - 1]][current] += 1
        if pos >= 2:
            # keyed by the (first, second) pair of preceding words
            after_two[(tokens[pos - 2], tokens[pos - 1])][current] += 1

    return dict(after_one), dict(after_two)

def gen_sentence(word2next, word2next2) -> str:
    "Generate one random sentence from the one-word and two-word follower tables"

    def pick(freqs):
        # one weighted draw from a {word: count} mapping, as a one-element list
        return random.choices(*zip(*freqs.items()))

    # the sentence starts after an unseen full stop
    chain = ['.']
    chain.extend(pick(word2next[chain[-1]]))
    finished = False
    while not finished:
        chain.extend(pick(word2next2[(chain[-2], chain[-1])]))
        finished = chain[-1] in sentence_ending

    # tidy: drop the leading stop, close up punctuation, restore 're, 's and I
    marks = sentence_ending + sentence_pausing
    sentence = ' '.join(chain[1:]).capitalize()
    tidy_steps = (
        (fr" ([{marks}])", r'\1'),
        (r" re\b", "'re"),
        (r" s\b", "'s"),
        (r"\bi\b", "I"),
    )
    for pattern, replacement in tidy_steps:
        sentence = re.sub(pattern, replacement, sentence)

    return sentence

if __name__ == "__main__":

   # Fetch the book, strip unwanted punctuation, then build both follower tables.
   txt_with_pauses_and_endings = remove_punctuation(read_book(text_url, text_start))
   word2next, word2next2 = word_follows_words(txt_with_pauses_and_endings)
   #%%
   # Generate and print one random sentence.
   sentence = gen_sentence(word2next, word2next2)
   print(sentence)</lang>

Output:

<# A SAMPLE OF GENERATED SENTENCES

As I stood petrified and staring down the river, over which spread a multitude of dogs, I flung myself forward under the night sky, a sky of gold.

He was walking through the gaps in the water.

There was no place to their intelligence, without a word they were in position there.

Ugh!

The ringing impact of trucks, the person or entity that provided you with the torrent to recover it.