Word frequency: Difference between revisions
(Added Kotlin) |
|||
Line 255: | Line 255: | ||
that 9 7924 |
that 9 7924 |
||
it 10 6661 |
it 10 6661 |
||
</pre> |
|||
=={{header|Simula}}== |
|||
<lang simula>COMMENT COMPILE WITH |
|||
$ cim -m64 word-count.sim |
|||
; |
|||
BEGIN |
|||
COMMENT ----- CLASSES FOR GENERAL USE ; |
|||
! ABSTRACT HASH KEY TYPE ; |
|||
CLASS HASHKEY; |
|||
VIRTUAL: |
|||
PROCEDURE HASH IS |
|||
INTEGER PROCEDURE HASH;; |
|||
PROCEDURE EQUALTO IS |
|||
BOOLEAN PROCEDURE EQUALTO(K); REF(HASHKEY) K;; |
|||
BEGIN |
|||
END HASHKEY; |
|||
! ABSTRACT HASH VALUE TYPE ; |
|||
CLASS HASHVAL; |
|||
BEGIN |
|||
! THERE IS NOTHING REQUIRED FOR THE VALUE TYPE ; |
|||
END HASHVAL; |
|||
CLASS HASHMAP; |
|||
BEGIN |
|||
CLASS INNERHASHMAP(N); INTEGER N; |
|||
BEGIN |
|||
INTEGER PROCEDURE INDEX(K); REF(HASHKEY) K; |
|||
BEGIN |
|||
INTEGER I; |
|||
IF K == NONE THEN |
|||
ERROR("HASHMAP.INDEX: NONE IS NOT A VALID KEY"); |
|||
I := MOD(K.HASH,N); |
|||
LOOP: |
|||
IF KEYTABLE(I) == NONE OR ELSE KEYTABLE(I).EQUALTO(K) THEN |
|||
INDEX := I |
|||
ELSE BEGIN |
|||
I := IF I+1 = N THEN 0 ELSE I+1; |
|||
GO TO LOOP; |
|||
END; |
|||
END INDEX; |
|||
! PUT SOMETHING IN ; |
|||
PROCEDURE PUT(K,V); REF(HASHKEY) K; REF(HASHVAL) V; |
|||
BEGIN |
|||
INTEGER I; |
|||
IF V == NONE THEN |
|||
ERROR("HASHMAP.PUT: NONE IS NOT A VALID VALUE"); |
|||
I := INDEX(K); |
|||
IF KEYTABLE(I) == NONE THEN BEGIN |
|||
IF SIZE = N THEN |
|||
ERROR("HASHMAP.PUT: TABLE FILLED COMPLETELY"); |
|||
KEYTABLE(I) :- K; |
|||
VALTABLE(I) :- V; |
|||
SIZE := SIZE+1; |
|||
END ELSE |
|||
VALTABLE(I) :- V; |
|||
END PUT; |
|||
! GET SOMETHING OUT ; |
|||
REF(HASHVAL) PROCEDURE GET(K); REF(HASHKEY) K; |
|||
BEGIN |
|||
INTEGER I; |
|||
IF K == NONE THEN |
|||
ERROR("HASHMAP.GET: NONE IS NOT A VALID KEY"); |
|||
I := INDEX(K); |
|||
IF KEYTABLE(I) == NONE THEN |
|||
GET :- NONE ! ERROR("HASHMAP.GET: KEY NOT FOUND"); |
|||
ELSE |
|||
GET :- VALTABLE(I); |
|||
END GET; |
|||
PROCEDURE CLEAR; |
|||
BEGIN |
|||
INTEGER I; |
|||
FOR I := 0 STEP 1 UNTIL N-1 DO BEGIN |
|||
KEYTABLE(I) :- NONE; |
|||
VALTABLE(I) :- NONE; |
|||
END; |
|||
SIZE := 0; |
|||
END CLEAR; |
|||
! DATA MEMBERS OF CLASS HASHMAP ; |
|||
REF(HASHKEY) ARRAY KEYTABLE(0:N-1); |
|||
REF(HASHVAL) ARRAY VALTABLE(0:N-1); |
|||
INTEGER SIZE; |
|||
END INNERHASHMAP; |
|||
PROCEDURE PUT(K,V); REF(HASHKEY) K; REF(HASHVAL) V; |
|||
BEGIN |
|||
IF IMAP.SIZE >= 0.75 * IMAP.N THEN |
|||
BEGIN |
|||
COMMENT RESIZE HASHMAP ; |
|||
REF(INNERHASHMAP) NEWIMAP; |
|||
REF(ITERATOR) IT; |
|||
NEWIMAP :- NEW INNERHASHMAP(2 * IMAP.N); |
|||
IT :- NEW ITERATOR(THIS HASHMAP); |
|||
WHILE IT.MORE DO |
|||
BEGIN |
|||
REF(HASHKEY) KEY; |
|||
KEY :- IT.NEXT; |
|||
NEWIMAP.PUT(KEY, IMAP.GET(KEY)); |
|||
END; |
|||
IMAP.CLEAR; |
|||
IMAP :- NEWIMAP; |
|||
END; |
|||
IMAP.PUT(K, V); |
|||
END; |
|||
REF(HASHVAL) PROCEDURE GET(K); REF(HASHKEY) K; |
|||
GET :- IMAP.GET(K); |
|||
PROCEDURE CLEAR; |
|||
IMAP.CLEAR; |
|||
INTEGER PROCEDURE SIZE; |
|||
SIZE := IMAP.SIZE; |
|||
REF(INNERHASHMAP) IMAP; |
|||
IMAP :- NEW INNERHASHMAP(16); |
|||
END HASHMAP; |
|||
CLASS ITERATOR(H); REF(HASHMAP) H; |
|||
BEGIN |
|||
INTEGER POS,KEYCOUNT; |
|||
BOOLEAN PROCEDURE MORE; |
|||
MORE := KEYCOUNT < H.SIZE; |
|||
REF(HASHKEY) PROCEDURE NEXT; |
|||
BEGIN |
|||
INSPECT H.IMAP DO |
|||
BEGIN |
|||
WHILE KEYTABLE(POS) == NONE DO |
|||
POS := POS+1; |
|||
NEXT :- KEYTABLE(POS); |
|||
KEYCOUNT := KEYCOUNT+1; |
|||
POS := POS+1; |
|||
END; |
|||
END NEXT; |
|||
END ITERATOR; |
|||
COMMENT ----- PROBLEM SPECIFIC CLASSES ; |
|||
HASHKEY CLASS TEXTHASHKEY(T); VALUE T; TEXT T; |
|||
BEGIN |
|||
INTEGER PROCEDURE HASH; |
|||
BEGIN |
|||
INTEGER I; |
|||
T.SETPOS(1); |
|||
WHILE T.MORE DO |
|||
I := 31*I+RANK(T.GETCHAR); |
|||
HASH := I; |
|||
END HASH; |
|||
BOOLEAN PROCEDURE EQUALTO(K); REF(HASHKEY) K; |
|||
EQUALTO := T = K QUA TEXTHASHKEY.T; |
|||
END TEXTHASHKEY; |
|||
HASHVAL CLASS COUNTER; |
|||
BEGIN |
|||
INTEGER COUNT; |
|||
END COUNTER; |
|||
REF(INFILE) INF; |
|||
REF(HASHMAP) MAP; |
|||
REF(TEXTHASHKEY) KEY; |
|||
REF(COUNTER) VAL; |
|||
REF(ITERATOR) IT; |
|||
TEXT LINE, WORD; |
|||
INTEGER I, J, MAXCOUNT, LINENO; |
|||
INTEGER ARRAY MAXCOUNTS(1:10); |
|||
REF(TEXTHASHKEY) ARRAY MAXWORDS(1:10); |
|||
WORD :- BLANKS(1000); |
|||
MAP :- NEW HASHMAP; |
|||
COMMENT MAP WORDS TO COUNTERS ; |
|||
INF :- NEW INFILE("135-0.txt"); |
|||
INF.OPEN(BLANKS(4096)); |
|||
WHILE NOT INF.LASTITEM DO |
|||
BEGIN |
|||
BOOLEAN INWORD; |
|||
PROCEDURE SAVE; |
|||
BEGIN |
|||
IF WORD.POS > 1 THEN |
|||
BEGIN |
|||
KEY :- NEW TEXTHASHKEY(WORD.SUB(1, WORD.POS - 1)); |
|||
VAL :- MAP.GET(KEY); |
|||
IF VAL == NONE THEN |
|||
BEGIN |
|||
VAL :- NEW COUNTER; |
|||
MAP.PUT(KEY, VAL); |
|||
END; |
|||
VAL.COUNT := VAL.COUNT + 1; |
|||
WORD := " "; |
|||
WORD.SETPOS(1); |
|||
END; |
|||
END SAVE; |
|||
LINENO := LINENO + 1; |
|||
LINE :- COPY(INF.IMAGE).STRIP; INF.INIMAGE; |
|||
COMMENT SEARCH WORDS IN LINE ; |
|||
COMMENT A WORD IS ANY SEQUENCE OF LETTERS ; |
|||
INWORD := FALSE; |
|||
LINE.SETPOS(1); |
|||
WHILE LINE.MORE DO |
|||
BEGIN |
|||
CHARACTER CH; |
|||
CH := LINE.GETCHAR; |
|||
IF CH >= 'a' AND CH <= 'z' THEN |
|||
CH := CHAR(RANK(CH) - RANK('a') + RANK('A')); |
|||
IF CH >= 'A' AND CH <= 'Z' THEN |
|||
BEGIN |
|||
IF NOT INWORD THEN |
|||
BEGIN |
|||
SAVE; |
|||
INWORD := TRUE; |
|||
END; |
|||
WORD.PUTCHAR(CH); |
|||
END ELSE |
|||
BEGIN |
|||
IF INWORD THEN |
|||
BEGIN |
|||
SAVE; |
|||
INWORD := FALSE; |
|||
END; |
|||
END; |
|||
END; |
|||
END; |
|||
INF.CLOSE; |
|||
COMMENT FIND 10 MOST COMMON WORDS ; |
|||
IT :- NEW ITERATOR(MAP); |
|||
WHILE IT.MORE DO |
|||
BEGIN |
|||
KEY :- IT.NEXT; |
|||
VAL :- MAP.GET(KEY); |
|||
FOR I := 1 STEP 1 UNTIL 10 DO |
|||
BEGIN |
|||
IF VAL.COUNT >= MAXCOUNTS(I) THEN |
|||
BEGIN |
|||
FOR J := 10 STEP -1 UNTIL I + 1 DO |
|||
BEGIN |
|||
MAXCOUNTS(J) := MAXCOUNTS(J - 1); |
|||
MAXWORDS(J) :- MAXWORDS(J - 1); |
|||
END; |
|||
MAXCOUNTS(I) := VAL.COUNT; |
|||
MAXWORDS(I) :- KEY; |
|||
GO TO BREAK; |
|||
END; |
|||
END; |
|||
BREAK: |
|||
END; |
|||
COMMENT OUTPUT 10 MOST COMMON WORDS ; |
|||
FOR I := 1 STEP 1 UNTIL 10 DO |
|||
BEGIN |
|||
OUTINT(MAXCOUNTS(I), 10); |
|||
OUTTEXT(" "); |
|||
OUTTEXT(MAXWORDS(I) QUA TEXTHASHKEY.T); |
|||
OUTIMAGE; |
|||
END; |
|||
END |
|||
</lang> |
|||
{{out}} |
|||
<pre> |
|||
41089 THE |
|||
19949 OF |
|||
14942 AND |
|||
14608 A |
|||
13951 TO |
|||
11214 IN |
|||
9648 HE |
|||
8621 WAS |
|||
7924 THAT |
|||
6661 IT |
|||
6 garbage collection(s) in 0.2 seconds. |
|||
</pre> |
</pre> |
||
Revision as of 13:57, 17 August 2017
- Task
Given a text file and an integer n, print the n most common words in the file (and the number of their occurrences) in decreasing frequency.
For the purposes of this task:
- A word is a sequence of one or more contiguous letters
- Uppercase letters are considered equivalent to their lowercase counterparts
- Words of equal frequency can be listed in any order
Show example output using Les Misérables from Project Gutenberg
as the text file input and display the top 10 most used words.
- History
This task was originally taken from programming pearls from Communications of the ACM June 1986 Volume 29 Number 6
where this problem is solved by Donald Knuth using literate programming and then critiqued by Doug McIlroy,
demonstrating solving the problem in a 6 line Unix shell script.
Clojure
<lang clojure>(defn count-words [file n]
(->> file slurp clojure.string/lower-case (re-seq #"\w+") frequencies (sort-by val >) (take n)))</lang>
- Output:
user=> (count-words "135-0.txt" 10) (["the" 41036] ["of" 19946] ["and" 14940] ["a" 14589] ["to" 13939] ["in" 11204] ["he" 9645] ["was" 8619] ["that" 7922] ["it" 6659])
Kotlin
The author of the Perl 6 entry has given a good account of the difficulties with this task and, in the absence of any clarification on the various issues, I've followed a similar 'literal' approach.
So, after first converting the text to lower case, I've assumed that a word is any sequence of one or more lower-case Unicode letters and obtained the same results as the Perl 6 version.
There is no change in the results if the numerals 0-9 are also regarded as letters. <lang scala>// version 1.1.3
import java.io.File
fun main(args: Array<String>) {
val text = File("135-0.txt").readText().toLowerCase() val r = Regex("""\p{javaLowerCase}+""") val matches = r.findAll(text) val wordGroups = matches.map { it.value } .groupBy { it } .map { Pair(it.key, it.value.size) } .sortedByDescending { it.second } .take(10) println("Rank Word Frequency") println("==== ==== =========") var rank = 1 for ((word, freq) in wordGroups) System.out.printf("%2d %-4s %5d\n", rank++, word, freq)
}</lang>
- Output:
Rank Word Frequency ==== ==== ========= 1 the 41088 2 of 19949 3 and 14942 4 a 14596 5 to 13951 6 in 11214 7 he 9648 8 was 8621 9 that 7924 10 it 6661
Perl 6
This is slightly trickier than it appears initially. The task specifically states: "A word is a sequence of one or more contiguous letters", so contractions and hyphenated words are broken up. Initially we might reach for a regex matcher like /\w+/ , but \w includes underscore, which is not a letter but a punctuation connector; and this text is full of underscores since that is how Project Gutenberg texts denote italicized text. The underscores are not actually parts of the words though, they are markup.
We might try /A-Za-z/ as a matcher but this text is bursting with French words containing various accented glyphs. Those are letters, so words will be incorrectly split up; (Misérables will be counted as 'mis' and 'rables', probably not what we want.)
Actually, in this case /A-Za-z/ returns very nearly the correct answer. Unfortunately, the name "Alèthe" appears once (only once!) in the text, gets incorrectly split into Al & the, and incorrectly reports 41089 occurrences of "the". The text has several words like "Panathenæa", "ça", "aérostiers" and "Keksekça" so the counts for 'a' are off too. The other 8 of the top 10 are "correct" using /A-Za-z/, but it is mostly by accident. The more accurate regex matcher is some kind of Unicode aware /\w/ minus underscore.
( Really, a better regex would allow for contractions and embedded apostrophes but that is beyond the scope of this task as it stands. There are words like cat-o'-nine-tails and will-o'-the-wisps in there too to make your day even more interesting. )
<lang perl6>sub MAIN ($filename, $top = 10) {
.say for ($filename.IO.slurp.lc ~~ m:g/[<[\w]-[_]>]+/)».Str.Bag.sort(-*.value)[^$top]
}</lang>
- Output:
Passing in the file name and 10:
the => 41088 of => 19949 and => 14942 a => 14596 to => 13951 in => 11214 he => 9648 was => 8621 that => 7924 it => 6661
Python
Python2.7
<lang python>import collections import re import string import sys
def main():
counter = collections.Counter(re.findall(r"\w+",open(sys.argv[1]).read().lower())) print counter.most_common(int(sys.argv[2]))
if __name__ == "__main__":
main()</lang>
- Output:
$ python wordcount.py 135-0.txt 10 [('the', 41036), ('of', 19946), ('and', 14940), ('a', 14589), ('to', 13939), ('in', 11204), ('he', 9645), ('was', 8619), ('that', 7922), ('it', 6659)]
Python3.6
<lang python>from collections import Counter from re import findall
les_mis_file = 'les_mis_135-0.txt'
def _count_words(fname):
with open(fname) as f: text = f.read() words = findall(r'\w+', text.lower()) return Counter(words)
def most_common_words_in_file(fname, n):
counts = _count_words(fname) for word, count in 'WORD', 'COUNT' + counts.most_common(n): print(f'{word:>10} {count:>6}')
if __name__ == "__main__":
n = int(input('How many?: ')) most_common_words_in_file(les_mis_file, n)</lang>
- Output:
How many?: 10 WORD COUNT the 41036 of 19946 and 14940 a 14586 to 13939 in 11204 he 9645 was 8619 that 7922 it 6659
Racket
<lang racket>#lang racket
(define (all-words f (case-fold string-downcase))
(map case-fold (regexp-match* #px"\\w+" (file->string f))))
(define (l.|l| l) (cons (car l) (length l)))
(define (counts l (>? >)) (sort (map l.|l| (group-by values l)) >? #:key cdr))
(module+ main
(take (counts (all-words "data/les-mis.txt")) 10))</lang>
- Output:
'(("the" . 41036) ("of" . 19946) ("and" . 14940) ("a" . 14589) ("to" . 13939) ("in" . 11204) ("he" . 9645) ("was" . 8619) ("that" . 7922) ("it" . 6659))
REXX
This REXX version doesn't need to sort the list of words.
Currently, this version treats accented (non-Latin) letters as non-letters. Additional support of accented letters is waiting for clarification from the task's author. <lang rexx>/*REXX program reads and displays a count of words a file. Word case is ignored.*/ parse arg fID top . /*obtain optional arguments from the CL*/ if fID== | fID=="," then fID= 'les_mes.TXT' /*None specified? Then use the default.*/ if top== | top=="," then top= 10 /* " " " " " " */ c=0; @.=0 /*initialize word list; word count. */ !.= /* " the original word instance*/
do #=1 while lines(fID)\==0 /*loop whilst there are lines in file. */ y=space( linein(fID) ) /*remove superfluous blanks in the line*/ $= /*$: is a list of words in this line. */ do j=1 for length(y); _=substr(y,j,1) /*obtain a character of the word found.*/ if datatype(_, 'M') then $=$ || _ /*Is it a letter? Append to $. */ else $=$ || ' ' /*Is it not a letter? Append blank. */ end /*j*/ $=strip($) /*strip any leading and trailing blanks*/ do while $\=; parse var $ z $ /*now, process each word in the $ list.*/ oz=z; upper z /*obtain an uppercase version of word. */ if @.z==0 then do; c=c+1; !.c=z; end /*bump the word#; assign word to array*/ @@.z=oz /*save the original case of the word. */ @.z=@.z + 1 /*bump the count of occurrences of word*/ end /*while*/ end /*#*/
say right('word',40) " " center(' rank ',6) " count " /*display a title for output*/ say right('════',40) " " center('══════',6) "═══════" /* " a title separator.*/
do tops=1 by 0 until tops>top; mc=0 /*process enough words to satisfy TOP.*/ tl= /*initialize (possibly) a list of words*/ do n=1 for c; z=!.n; count=@.z /*process the list of words in the file*/ if count<1 then iterate /*Is count too small? Then ignore it.*/ z=!.n /*get the name of the capitalized word.*/ if count==mc then tl=tl z /*handle cases of tied number of words.*/ if count>mc then do; mc=count /*this word count is the current max. */ tl=z /* " word " " " " */ end end /*n*/ w=0 /*will be the maximum length of count. */ wr=max( length(' rank '), length(top) ) /*find the maximum length of the rank #*/ do d=1 for words(tl); _=word(tl, d) if d==1 then w=max(8, length(@._)) /*use the length of the first word used*/ say right(@@._, 40 ) right(tops, wr) right(@._, w) @._=0 /*nullify this word count for next time*/ end /*d*/ tops=tops + words(tl) /*correctly handle the tied rankings. */ end /*tops*/ /*stick a fork in it, we're all done. */</lang>
- output when using the default inputs:
This output agrees with UNIX Shell.
word rank count ════ ══════ ═══════ the 1 41089 of 2 19949 and 3 14942 a 4 14608 to 5 13951 in 6 11214 he 7 9648 was 8 8621 that 9 7924 it 10 6661
Simula
<lang simula>COMMENT COMPILE WITH $ cim -m64 word-count.sim
BEGIN
COMMENT ----- CLASSES FOR GENERAL USE ;
! ABSTRACT HASH KEY TYPE ; CLASS HASHKEY; VIRTUAL: PROCEDURE HASH IS INTEGER PROCEDURE HASH;; PROCEDURE EQUALTO IS BOOLEAN PROCEDURE EQUALTO(K); REF(HASHKEY) K;; BEGIN END HASHKEY;
! ABSTRACT HASH VALUE TYPE ; CLASS HASHVAL; BEGIN ! THERE IS NOTHING REQUIRED FOR THE VALUE TYPE ; END HASHVAL;
CLASS HASHMAP; BEGIN CLASS INNERHASHMAP(N); INTEGER N; BEGIN
INTEGER PROCEDURE INDEX(K); REF(HASHKEY) K; BEGIN INTEGER I; IF K == NONE THEN ERROR("HASHMAP.INDEX: NONE IS NOT A VALID KEY"); I := MOD(K.HASH,N); LOOP: IF KEYTABLE(I) == NONE OR ELSE KEYTABLE(I).EQUALTO(K) THEN INDEX := I ELSE BEGIN I := IF I+1 = N THEN 0 ELSE I+1; GO TO LOOP; END; END INDEX;
! PUT SOMETHING IN ; PROCEDURE PUT(K,V); REF(HASHKEY) K; REF(HASHVAL) V; BEGIN INTEGER I; IF V == NONE THEN ERROR("HASHMAP.PUT: NONE IS NOT A VALID VALUE"); I := INDEX(K); IF KEYTABLE(I) == NONE THEN BEGIN IF SIZE = N THEN ERROR("HASHMAP.PUT: TABLE FILLED COMPLETELY"); KEYTABLE(I) :- K; VALTABLE(I) :- V; SIZE := SIZE+1; END ELSE VALTABLE(I) :- V; END PUT;
! GET SOMETHING OUT ; REF(HASHVAL) PROCEDURE GET(K); REF(HASHKEY) K; BEGIN INTEGER I; IF K == NONE THEN ERROR("HASHMAP.GET: NONE IS NOT A VALID KEY"); I := INDEX(K); IF KEYTABLE(I) == NONE THEN GET :- NONE ! ERROR("HASHMAP.GET: KEY NOT FOUND"); ELSE GET :- VALTABLE(I); END GET;
PROCEDURE CLEAR; BEGIN INTEGER I; FOR I := 0 STEP 1 UNTIL N-1 DO BEGIN KEYTABLE(I) :- NONE; VALTABLE(I) :- NONE; END; SIZE := 0; END CLEAR;
! DATA MEMBERS OF CLASS HASHMAP ; REF(HASHKEY) ARRAY KEYTABLE(0:N-1); REF(HASHVAL) ARRAY VALTABLE(0:N-1); INTEGER SIZE;
END INNERHASHMAP;
PROCEDURE PUT(K,V); REF(HASHKEY) K; REF(HASHVAL) V; BEGIN IF IMAP.SIZE >= 0.75 * IMAP.N THEN BEGIN COMMENT RESIZE HASHMAP ; REF(INNERHASHMAP) NEWIMAP; REF(ITERATOR) IT; NEWIMAP :- NEW INNERHASHMAP(2 * IMAP.N); IT :- NEW ITERATOR(THIS HASHMAP); WHILE IT.MORE DO BEGIN REF(HASHKEY) KEY; KEY :- IT.NEXT; NEWIMAP.PUT(KEY, IMAP.GET(KEY)); END; IMAP.CLEAR; IMAP :- NEWIMAP; END; IMAP.PUT(K, V); END;
REF(HASHVAL) PROCEDURE GET(K); REF(HASHKEY) K; GET :- IMAP.GET(K);
PROCEDURE CLEAR; IMAP.CLEAR;
INTEGER PROCEDURE SIZE; SIZE := IMAP.SIZE;
REF(INNERHASHMAP) IMAP;
IMAP :- NEW INNERHASHMAP(16); END HASHMAP;
CLASS ITERATOR(H); REF(HASHMAP) H; BEGIN INTEGER POS,KEYCOUNT;
BOOLEAN PROCEDURE MORE; MORE := KEYCOUNT < H.SIZE;
REF(HASHKEY) PROCEDURE NEXT; BEGIN INSPECT H.IMAP DO BEGIN WHILE KEYTABLE(POS) == NONE DO POS := POS+1; NEXT :- KEYTABLE(POS); KEYCOUNT := KEYCOUNT+1; POS := POS+1; END; END NEXT;
END ITERATOR;
COMMENT ----- PROBLEM SPECIFIC CLASSES ;
HASHKEY CLASS TEXTHASHKEY(T); VALUE T; TEXT T; BEGIN INTEGER PROCEDURE HASH; BEGIN INTEGER I; T.SETPOS(1); WHILE T.MORE DO I := 31*I+RANK(T.GETCHAR); HASH := I; END HASH; BOOLEAN PROCEDURE EQUALTO(K); REF(HASHKEY) K; EQUALTO := T = K QUA TEXTHASHKEY.T; END TEXTHASHKEY;
HASHVAL CLASS COUNTER; BEGIN INTEGER COUNT; END COUNTER;
REF(INFILE) INF; REF(HASHMAP) MAP; REF(TEXTHASHKEY) KEY; REF(COUNTER) VAL; REF(ITERATOR) IT; TEXT LINE, WORD; INTEGER I, J, MAXCOUNT, LINENO; INTEGER ARRAY MAXCOUNTS(1:10); REF(TEXTHASHKEY) ARRAY MAXWORDS(1:10);
WORD :- BLANKS(1000); MAP :- NEW HASHMAP; COMMENT MAP WORDS TO COUNTERS ;
INF :- NEW INFILE("135-0.txt"); INF.OPEN(BLANKS(4096)); WHILE NOT INF.LASTITEM DO BEGIN BOOLEAN INWORD;
PROCEDURE SAVE; BEGIN IF WORD.POS > 1 THEN BEGIN KEY :- NEW TEXTHASHKEY(WORD.SUB(1, WORD.POS - 1)); VAL :- MAP.GET(KEY); IF VAL == NONE THEN BEGIN VAL :- NEW COUNTER; MAP.PUT(KEY, VAL); END; VAL.COUNT := VAL.COUNT + 1; WORD := " "; WORD.SETPOS(1); END; END SAVE;
LINENO := LINENO + 1; LINE :- COPY(INF.IMAGE).STRIP; INF.INIMAGE;
COMMENT SEARCH WORDS IN LINE ; COMMENT A WORD IS ANY SEQUENCE OF LETTERS ;
INWORD := FALSE; LINE.SETPOS(1); WHILE LINE.MORE DO BEGIN CHARACTER CH; CH := LINE.GETCHAR; IF CH >= 'a' AND CH <= 'z' THEN CH := CHAR(RANK(CH) - RANK('a') + RANK('A')); IF CH >= 'A' AND CH <= 'Z' THEN BEGIN IF NOT INWORD THEN BEGIN SAVE; INWORD := TRUE; END; WORD.PUTCHAR(CH); END ELSE BEGIN IF INWORD THEN BEGIN SAVE; INWORD := FALSE; END; END; END; END; INF.CLOSE;
COMMENT FIND 10 MOST COMMON WORDS ;
IT :- NEW ITERATOR(MAP); WHILE IT.MORE DO BEGIN KEY :- IT.NEXT; VAL :- MAP.GET(KEY); FOR I := 1 STEP 1 UNTIL 10 DO BEGIN IF VAL.COUNT >= MAXCOUNTS(I) THEN BEGIN FOR J := 10 STEP -1 UNTIL I + 1 DO BEGIN MAXCOUNTS(J) := MAXCOUNTS(J - 1); MAXWORDS(J) :- MAXWORDS(J - 1); END; MAXCOUNTS(I) := VAL.COUNT; MAXWORDS(I) :- KEY; GO TO BREAK; END; END; BREAK: END;
COMMENT OUTPUT 10 MOST COMMON WORDS ;
FOR I := 1 STEP 1 UNTIL 10 DO BEGIN OUTINT(MAXCOUNTS(I), 10); OUTTEXT(" "); OUTTEXT(MAXWORDS(I) QUA TEXTHASHKEY.T); OUTIMAGE; END;
END </lang>
- Output:
41089 THE 19949 OF 14942 AND 14608 A 13951 TO 11214 IN 9648 HE 8621 WAS 7924 THAT 6661 IT 6 garbage collection(s) in 0.2 seconds.
UNIX Shell
<lang bash>#!/bin/sh cat ${1} | tr -cs A-Za-z '\n' | tr A-Z a-z | sort | uniq -c | sort -rn | sed ${2}q</lang>
- Output:
$ ./wordcount.sh 135-0.txt 10 41089 the 19949 of 14942 and 14608 a 13951 to 11214 in 9648 he 8621 was 7924 that 6661 it