Inverted index
You are encouraged to solve this task according to the task description, using any language you may know.
An Inverted Index is a data structure used to create full text search.
Given a set of text files, implement a program to create an inverted index. Also create a user interface to do a search using that inverted index which returns a list of files that contain the query term / terms. The search index can be in memory.
AutoHotkey
<lang AutoHotkey>; http://www.autohotkey.com/forum/viewtopic.php?t=41479 inputbox, files, files, file pattern such as c:\files\*.txt
word2docs := object() ; autohotkey_L is needed.
stime := A_tickcount Loop, %files%, 0,1 {
tooltip,%A_index% / 500 wordList := WordsIn(A_LoopFileFullPath) InvertedIndex(wordList, A_loopFileFullpath)
}
tooltip msgbox, % "total time " (A_tickcount-stime)/1000
gosub, search return
search: Loop {
InputBox, keyword , input single keyword only msgbox, % foundDocs := findword(keyword)
} return
WordsIn(docpath) {
FileRead, content, %docpath% spos = 1 Loop { if !(spos := Regexmatch(content, "[a-zA-Z]{2,}",match, spos)) break spos += strlen(match) this_wordList .= match "`n" } Sort, this_wordList, U return this_wordList
}
InvertedIndex(byref words, docpath) {
global word2docs
loop, parse, words, `n,`r { if A_loopField = continue word2docs[A_loopField] := word2docs[A_loopField] docpath "`n" }
}
findWord(word2find) {
global word2docs
if (word2docs[word2find] = "") return "" else return word2docs[word2find]
}</lang>
C
The code is stupidly long, having to implement a Trie to store strings and all -- the program doesn't do anything shiny, but Tries may be interesting to look at. <lang C>#include <stdio.h>
- include <stdlib.h>
char chr_legal[] = "abcdefghijklmnopqrstuvwxyz0123456789_-./"; int chr_idx[256] = {0}; char idx_chr[256] = {0};
- define FNAME 0
typedef struct trie_t *trie, trie_t; struct trie_t { trie next[sizeof(chr_legal)]; /* next letter; slot 0 is for file name */ int eow; };
trie trie_new() { return calloc(sizeof(trie_t), 1); }
- define find_word(r, w) trie_trav(r, w, 1)
/* tree traversal: returns node if end of word and matches string, optionally
* create node if doesn't exist */
trie trie_trav(trie root, char * str, int no_create) { int c; while (root) { if ((c = str[0]) == '\0') { if (!root->eow && no_create) return 0; break; } if (! (c = chr_idx[c]) ) { str++; continue; }
if (!root->next[c]) { if (no_create) return 0; root->next[c] = trie_new(); } root = root->next[c]; str++; } return root; }
/* complete traversal of whole tree, calling callback at each end of word node.
* similar method can be used to free nodes, had we wanted to do that. */
int trie_all(trie root, char path[], int depth, int (*callback)(char *)) { int i; if (root->eow && !callback(path)) return 0;
for (i = 1; i < sizeof(chr_legal); i++) { if (!root->next[i]) continue;
path[depth] = idx_chr[i]; path[depth + 1] = '\0'; if (!trie_all(root->next[i], path, depth + 1, callback)) return 0; } return 1; }
void add_index(trie root, char *word, char *fname) { trie x = trie_trav(root, word, 0); x->eow = 1;
if (!x->next[FNAME]) x->next[FNAME] = trie_new(); x = trie_trav(x->next[FNAME], fname, 0); x->eow = 1; }
int print_path(char *path) { printf(" %s", path); return 1; }
/* pretend we parsed text files and got lower cased words: dealing *
* with text file is a whole other animal and would make code too long */
char *files[] = { "f1.txt", "source/f2.txt", "other_file" }; char *text[][5] ={{ "it", "is", "what", "it", "is" }, { "what", "is", "it", 0 }, { "it", "is", "a", "banana", 0 }};
trie init_tables() { int i, j; trie root = trie_new(); for (i = 0; i < sizeof(chr_legal); i++) { chr_idx[(int)chr_legal[i]] = i + 1; idx_chr[i + 1] = chr_legal[i]; }
for (i = 0; i < 3; i++) { for (j = 0; j < 5; j++) { if (!text[i][j]) break; add_index(root, text[i][j], files[i]); } } return root; }
void search_index(trie root, char *word) { char path[1024]; printf("Search for \"%s\": ", word); trie found = find_word(root, word);
if (!found) printf("not found\n"); else { trie_all(found->next[FNAME], path, 0, print_path); printf("\n"); } }
int main() { trie root = init_tables();
search_index(root, "what"); search_index(root, "is"); search_index(root, "banana"); search_index(root, "boo"); return 0; }</lang>Output:<lang>Search for "what": f1.txt source/f2.txt Search for "is": f1.txt other_file source/f2.txt Search for "banana": other_file Search for "boo": not found</lang>
C#
<lang csharp>using System; using System.Collections.Generic; using System.IO; using System.Linq;
class InvertedIndex {
static Dictionary<TItem, IEnumerable<TKey>> Invert<TKey, TItem>(Dictionary<TKey, IEnumerable<TItem>> dictionary) { return dictionary .SelectMany(keyValuePair => keyValuePair.Value.Select(item => new KeyValuePair<TItem, TKey>(item, keyValuePair.Key))) .GroupBy(keyValuePair => keyValuePair.Key) .ToDictionary(group => group.Key, group => group.Select(keyValuePair => keyValuePair.Value)); }
static void Main() { Console.Write("files: "); var files = Console.ReadLine(); Console.Write("find: "); var find = Console.ReadLine(); var dictionary = files.Split().ToDictionary(file => file, file => File.ReadAllText(file).Split().AsEnumerable()); Console.WriteLine("{0} found in: {1}", find, string.Join(" ", Invert(dictionary)[find])); }
}</lang> Sample output: <lang>files: file1 file2 file3 find: what what found in: file1 file2</lang>
Factor
<lang factor>USING: assocs fry io.encodings.utf8 io.files kernel sequences sets splitting vectors ; IN: rosettacode.inverted-index
- file-words ( file -- assoc )
utf8 file-contents " ,;:!?.()[]{}\n\r" split harvest ;
- add-to-file-list ( files file -- files )
over [ swap [ adjoin ] keep ] [ nip 1vector ] if ;
- add-to-index ( words index file -- )
'[ _ [ _ add-to-file-list ] change-at ] each ;
- (index-files) ( files index -- )
[ [ [ file-words ] keep ] dip swap add-to-index ] curry each ;
- index-files ( files -- index )
H{ } clone [ (index-files) ] keep ;
- query ( terms index -- files )
[ at ] curry map [ ] [ intersect ] map-reduce ;
</lang>
Example use : <lang>( scratchpad ) { "f1" "f2" "f3" } index-files
--- Data stack: H{ { "a" ~vector~ } { "is" ~vector~ } { "what" ~vector~ } { ... ( scratchpad ) { "what" "is" "it" } swap query . V{ "f1" "f2" } </lang>
F#
<lang fsharp>open System open System.IO
// Map search terms to associated set of files type searchIndexMap = Map<string, Set<string>>
let inputSearchCriteria() =
let readLine prompt = printf "%s: " prompt Console.ReadLine().Split() readLine "Files", (readLine "Find") |> Array.map (fun s -> s.ToLower())
let updateIndex indexMap keyValuePair =
let k, v = keyValuePair match Map.tryFind k indexMap with | None -> Map.add k (Set.singleton v) indexMap | Some set -> Map.add k (Set.add v set) indexMap
let buildIndex files =
let fileData file = File.ReadAllText(file).Split() |> Seq.map (fun word -> word.ToLower(), file) files |> Seq.collect fileData |> Seq.fold updateIndex Map.empty
let searchFiles() =
let files, terms = inputSearchCriteria() let indexes = buildIndex files
let searchResults = terms |> Seq.map (fun term -> Map.find term indexes) |> Set.intersectMany
printf "Found in: " ; searchResults |> Set.iter (printf "%s ") ; printfn ""</lang>
Sample usage:
searchFiles() Files: file1.txt file2.txt file3.txt Find: what is Found in: file1.txt file2.txt
Go
<lang go>package main
import (
"bytes" "encoding/line" "fmt" "os"
)
// inverted index representation var index map[string][]int // ints index into indexed var indexed []doc
type doc struct {
file string title string
}
func main() {
// initialize representation index = make(map[string][]int)
// build index if err := indexDir("docs"); err != nil { fmt.Println(err) return }
// run user interface ui()
}
func indexDir(dir string) os.Error {
df, err := os.Open(dir) if err != nil { return err } fis, err := df.Readdir(-1) if err != nil { return err } if len(fis) == 0 { return os.NewError(fmt.Sprintf("no files in %s", dir)) } indexed := 0 for _, fi := range fis { if fi.IsRegular() { if indexFile(dir + "/" + fi.Name) { indexed++ } } } return nil
}
func indexFile(fn string) bool {
f, err := os.Open(fn) if err != nil { fmt.Println(err) return false // only false return }
// register new file x := len(indexed) indexed = append(indexed, doc{fn, fn}) pdoc := &indexed[x]
// scan lines r := line.NewReader(f, 4096) lines := 0 for { b, isPrefix, err := r.ReadLine() switch { case err == os.EOF: return true case err != nil: fmt.Println(err) return true case isPrefix: fmt.Printf("%s: unexpeced long line\n", fn) return true case lines < 20 && bytes.HasPrefix(b, []byte("Title:")): // in a real program you would write code // to skip the Gutenberg document header // and not index it. pdoc.title = string(b[7:]) } // index line of text in b // again, in a real program you would write a much // nicer word splitter. wordLoop: for _, bword := range bytes.Fields(b) { bword := bytes.Trim(bword, ".,-~?!\"'`;:()<>[]{}\\|/=_+*&^%$#@") if len(bword) > 0 { word := string(bword) dl := index[word] for _, d := range dl { if d == x { continue wordLoop } } index[word] = append(dl, x) } } } return true
}
func ui() {
fmt.Println(len(index), "words indexed in", len(indexed), "files") fmt.Println("enter single words to search for") fmt.Println("enter a blank line when done") var word string for { fmt.Print("search word: ") wc, _ := fmt.Scanln(&word) if wc == 0 { return } switch dl := index[word]; len(dl) { case 0: fmt.Println("no match") case 1: fmt.Println("one match:") fmt.Println(" ", indexed[dl[0]].file, indexed[dl[0]].title) default: fmt.Println(len(dl), "matches:") for _, d := range dl { fmt.Println(" ", indexed[d].file, indexed[d].title) } } }
}</lang> Session:
8447 words indexed in 11 files enter single words to search for enter a blank line when done search word: dog no match search word: cat one match: docs/pg28554.txt Beyond Lies the Wub search word: robot 6 matches: docs/pg32032.txt Second Variety docs/pg32522.txt Mr. Spaceship docs/pg32832.txt Piper in the Woods docs/pg28698.txt The Crystal Crypt docs/pg28767.txt The Defenders docs/pg32154.txt The Variable Man
Haskell
<lang haskell>import Control.Monad import Data.Char (isAlpha, toLower) import qualified Data.Map as M import qualified Data.IntSet as S import System.Environment (getArgs)
main = do
(files, _ : q) <- liftM (break (== "--")) getArgs buildII files >>= mapM_ putStrLn . queryII q
data IIndex = IIndex
[FilePath] -- Files in the index (M.Map String S.IntSet) -- Maps word to indices of the list deriving Show
buildII :: [FilePath] -> IO IIndex buildII files =
liftM (IIndex files . foldl f M.empty . zip [0..]) $ mapM readFile files where f m (i, s) = foldl g m $ map (lowercase . filter isAlpha) $ words s where g m word = M.insertWith S.union word (S.singleton i) m
queryII :: [String] -> IIndex -> [FilePath] queryII q (IIndex files m) =
map (files !!) $ S.toList $ intersections $ map (\word -> M.findWithDefault S.empty (lowercase word) m) q
intersections [] = S.empty intersections xs = foldl1 S.intersection xs
lowercase = map toLower</lang>
An example of use, assuming the program is named iindex
and there exist files t0
, t1
, and t2
with contents "It is what it is.", "What is it?", and "It is a banana.":
$ iindex t0 t1 t2 -- what is it t0 t1
Icon and Unicon
The following implements a simple case insensitive inverse index using lists simulating texts. <lang Icon>procedure main()
texts := table() # substitute for read and parse files texts["T0.txt"] := ["it", "is", "what", "it", "is"] texts["T1.txt"] := ["what", "is", "it"] texts["T2.txt"] := ["it", "is", "a", "banana"]
every textname := key(texts) do # build index for each 'text' SII := InvertedIndex(SII,textname,texts[textname]) TermSearchUI(SII) # search UI
end
procedure InvertedIndex(ii,k,words) #: accumulate a simple inverted index
/ii := table(set()) # create lookup table and null set every w := !words do {
if *ii[w] = 0 then ii[w] := set() # new word, new set insert(ii[w],k) }
return ii end
procedure TermSearchUI(ii) #: search UI, all words must match
repeat {
writes("Enter search terms (^z to quit) : ") terms := map(trim(read() | break))
x := [] terms ? while not pos(0) do { tab(many(' \t')) put(x,tab(upto('\ \t')|0)) } show("Searching for : ",x) show("Found in : ",s := TermSearch(ii,x)) | show("Not found : ",x) }
write("End of search") return end
procedure TermSearch(ii,x) #: return set of matches or fail every s := !x do
( /u := ii[s] ) | (u **:= ii[s])
if *u > 0 then return u end
procedure show(s,x) # display helper every writes(s|!x) do writes(" ") write() return end</lang>
Output:
Enter search terms (^z to quit) : is it Searching for : is it Found in : T0.txt T2.txt T1.txt Enter search terms (^z to quit) : banana Searching for : banana Found in : T2.txt Enter search terms (^z to quit) : fox Searching for : fox Not found : fox Enter search terms (^z to quit) : what Searching for : what Found in : T0.txt T1.txt
The following code will build a full index. Modification of search routines is left as an exercise: <lang Unicon>record InvertedIndexRec(simple,full)
procedure FullInvertedIndex(ii,k,words) #: accumulate a full inverted index
/ii := InvertedIndexRec( table(set()), table() ) # create lookup table and null set
wc := 0 every (w := !words, wc +:= 1) do {
if *ii.simple[w] = 0 then { ii.simple[w] := set() # new word, new set ii.full[w] := table() # also new table } insert(ii.simple[w],k) /ii.full[w,k] := set() insert(ii.full[w,k],wc) }
return ii end</lang>
J
This just implements the required spec, with a simplistic definition for what a word is, and with no support for stop words, nor for phrase searching.
<lang J>require'files regex strings'
rxutf8 0 NB. support latin1 searches for this example, instead of utf8 files=:words=:buckets=: wordre=: rxcomp '[\w]+' parse=: ,@:rxfrom~ wordre&rxmatches
invert=: verb define
files=: files,todo=. ~.y-.files >invert1 each todo
)
invert1=: verb define
file=. files i.<y words=: ~.words,contents=. ~.parse tolower fread jpath y ind=. words i. contents buckets=: buckets,(1+words -&# buckets)#a: #buckets=: (file,~each ind{buckets) ind}buckets
)
search=: verb define
hits=. buckets{~words i.~.parse tolower y files {~ >([-.-.)each/hits
)</lang>
Example use:
<lang J> invert '~help/primer/cut.htm';'~help/primer/end.htm';'~help/primer/gui.htm'
>search 'finally learning'
~help/primer/end.htm ~help/primer/gui.htm
>search 'argument'
~help/primer/cut.htm ~help/primer/gui.htm
>search 'around'
~help/primer/gui.htm</lang>
OCaml
We store the inverted index data in the file "data.inv" using the sexplib library, so we compile with:
ocamlc -c \ -pp "camlp4o -I `ocamlc -where`/type-conv \ -I `ocamlc -where`/sexplib \ pa_type_conv.cmo pa_sexp_conv.cmo" \ unix.cma bigarray.cma nums.cma -I +sexplib sexplib.cma str.cma \ inv.ml ocamlc -o inv.byte unix.cma bigarray.cma nums.cma -I +sexplib sexplib.cma str.cma inv.cmo
<lang ocaml>TYPE_CONV_PATH "Inverted_index"
type files = string array with sexp type inverted_index = (string * int list) list with sexp
type t = files * inverted_index with sexp
open Sexplib
let data_file = "data.inv" let data_path = Filename.concat Filename.temp_dir_name data_file
let get_inv_index() =
if Sys.file_exists data_path then t_of_sexp(Sexp.load_sexp data_path) else ([| |], [])
let load_file f =
let ic = open_in f in let n = in_channel_length ic in let s = String.create n in really_input ic s 0 n; close_in ic; (s)
let array_push ar v =
let len = Array.length ar in Array.init (succ len) (fun i -> if i < len then Array.unsafe_get ar i else v), len
let uniq lst =
let h = Hashtbl.create (List.length lst) in List.iter (fun x -> Hashtbl.replace h x ()) lst; Hashtbl.fold (fun x () xs -> x :: xs) h []
let combine words i inv_index =
let h = Hashtbl.create (List.length inv_index) in List.iter (fun (w, from) -> Hashtbl.replace h w from) inv_index; List.iter (fun w -> if Hashtbl.mem h w then begin let from = Hashtbl.find h w in Hashtbl.replace h w (i::from) end else Hashtbl.add h w [i] ) words; Hashtbl.fold (fun w from acc -> (w, from) :: acc) h []
let words_of_file in_file =
let str = load_file in_file in let words = Str.split (Str.regexp "[ \r\n\t,;.?!:'/\034()]") str in let words = uniq words in (words)
let index_file in_file =
let words = words_of_file in_file in let files, inv_index = get_inv_index() in let files, i = array_push files in_file in let inv_index = combine words i inv_index in let se = sexp_of_t (files, inv_index) in Sexp.save data_path se
let search_word word =
let files, inv_index = get_inv_index() in try let is_in = List.assoc word inv_index in List.iter (fun i -> print_endline files.(i)) is_in with Not_found -> print_endline "# Not Found"
let usage() =
Printf.printf "Usage: %s \ --index-file <file.txt> / \ --search-word <some-word>\n%!" Sys.argv.(0); exit 1
let () =
let cmd, arg = try (Sys.argv.(1), Sys.argv.(2)) with _ -> usage() in match cmd, arg with | "--index-file", in_file -> index_file in_file | "--search-word", word -> search_word word | _ -> usage()</lang>
Perl
<lang perl>use Set::Object 'set';
- given an array of files, returns the index
sub createindex {
my @files = @_;
my %iindex;
foreach my $file (@files) {
open(F, "<", $file) or die "Can't read file $file: $!"; while(<F>) {
s/\A\W+//;
foreach my $w (map {lc} grep {length() >= 3} split /\W+/) { if ( exists($iindex{$w}) ) { $iindex{$w}->insert($file); } else { $iindex{$w} = set($file); } } } close(F);
} return %iindex;
}
- given an index, search for words
sub search_words_with_index {
my %idx = %{shift()}; my @words = @_; my $res = set(); foreach my $w (map {lc} @words) {
$w =~ s/\W+//g; # strip non-words chars
length $w < 3 and next; exists $idx{$w} or return set(); $res = $res->is_null ? set(@{$idx{$w}}) : $res * $idx{$w}; # set intersection } return @$res;
}
- TESTING
- USAGE: invidx.pl the,list,of,words file1 file2 .. fileN
my @searchwords = split /,/, shift;
- first arg is a comma-separated list of words to search for
print "$_\n"
foreach search_words_with_index({createindex(@ARGV)}, @searchwords);</lang>
PicoLisp
Assuming three files "file1", "file2" and "file3":
$ cat file1 it is what it is $ cat file2 what is it $ cat file3 it is a banana
we can read them into a binary tree in the global variable '*MyIndex' <lang PicoLisp>(off *MyIndex)
(use Word
(for File '("file1" "file2" "file3") (in File (while (skip) (if (idx '*MyIndex (setq Word (till " ^I^J^M" T)) T) (push1 (car @) File) (set Word (cons File)) ) ) ) ) )
(de searchFor @
(apply sect (extract '((Word) (val (car (idx '*MyIndex Word)))) (rest) ) ) )</lang>
Output:
: (searchFor "what" "is" "it") -> ("file2" "file1") : (searchFor "a" "banana") -> ("file3") : (searchFor "it" "is") -> ("file3" "file2" "file1")
Python
Simple inverted index
First the simple inverted index from here together with an implementation of a search for (multiple) terms from that index. <lang python> This implements: http://en.wikipedia.org/wiki/Inverted_index of 28/07/10
from pprint import pprint as pp from glob import glob try: reduce except: from functools import reduce try: raw_input except: raw_input = input
def parsetexts(fileglob='InvertedIndex/T*.txt'):
texts, words = {}, set() for txtfile in glob(fileglob): with open(txtfile, 'r') as f: txt = f.read().split() words |= set(txt) texts[txtfile.split('\\')[-1]] = txt return texts, words
def termsearch(terms): # Searches simple inverted index
return reduce(set.intersection, (invindex[term] for term in terms), set(texts.keys()))
texts, words = parsetexts() print('\nTexts') pp(texts) print('\nWords') pp(sorted(words))
invindex = {word:set(txt
for txt, wrds in texts.items() if word in wrds) for word in words}
print('\nInverted Index') pp({k:sorted(v) for k,v in invindex.items()})
terms = ["what", "is", "it"] print('\nTerm Search for: ' + repr(terms)) pp(sorted(termsearch(terms)))</lang>
Sample Output
Texts {'T0.txt': ['it', 'is', 'what', 'it', 'is'], 'T1.txt': ['what', 'is', 'it'], 'T2.txt': ['it', 'is', 'a', 'banana']} Words ['a', 'banana', 'is', 'it', 'what'] Inverted Index {'a': ['T2.txt'], 'banana': ['T2.txt'], 'is': ['T0.txt', 'T1.txt', 'T2.txt'], 'it': ['T0.txt', 'T1.txt', 'T2.txt'], 'what': ['T0.txt', 'T1.txt']} Term Search for: ['what', 'is', 'it'] ['T0.txt', 'T1.txt']
Full inverted index
There is a re-write of the termsearch
function to work off this type of index, as well as a new phrasesearch
function
The phrasesearch
function will return multiple matches in a text, and goes on to show how this can be used to pick the text with most matches.
It is assumed that the following code is added to the end of the code for the simple case above and so shares its file opening and parsing results <lang python>from collections import Counter
def termsearch(terms): # Searches full inverted index
if not set(terms).issubset(words): return set() return reduce(set.intersection, (set(x[0] for x in txtindx) for term, txtindx in finvindex.items() if term in terms), set(texts.keys()) )
def phrasesearch(phrase):
wordsinphrase = phrase.strip().strip('"').split() if not set(wordsinphrase).issubset(words): return set() #firstword, *otherwords = wordsinphrase # Only Python 3 firstword, otherwords = wordsinphrase[0], wordsinphrase[1:] found = [] for txt in termsearch(wordsinphrase): # Possible text files for firstindx in (indx for t,indx in finvindex[firstword] if t == txt): # Over all positions of the first word of the phrase in this txt if all( (txt, firstindx+1 + otherindx) in finvindex[otherword] for otherindx, otherword in enumerate(otherwords) ): found.append(txt) return found
finvindex = {word:set((txt, wrdindx)
for txt, wrds in texts.items() for wrdindx in (i for i,w in enumerate(wrds) if word==w) if word in wrds) for word in words}
print('\nFull Inverted Index') pp({k:sorted(v) for k,v in finvindex.items()})
print('\nTerm Search on full inverted index for: ' + repr(terms)) pp(sorted(termsearch(terms)))
phrase = '"what is it"' print('\nPhrase Search for: ' + phrase) print(phrasesearch(phrase))
- Show multiple match capability
phrase = '"it is"' print('\nPhrase Search for: ' + phrase) ans = phrasesearch(phrase) print(ans) ans = Counter(ans) print(' The phrase is found most commonly in text: ' + repr(ans.most_common(1)[0][0]))</lang>
Sample Output
Full Inverted Index {'a': [('T2.txt', 2)], 'banana': [('T2.txt', 3)], 'is': [('T0.txt', 1), ('T0.txt', 4), ('T1.txt', 1), ('T2.txt', 1)], 'it': [('T0.txt', 0), ('T0.txt', 3), ('T1.txt', 2), ('T2.txt', 0)], 'what': [('T0.txt', 2), ('T1.txt', 0)]} Term Search on full inverted index for: ['what', 'is', 'it'] ['T0.txt', 'T1.txt'] Phrase Search for: "what is it" ['T1.txt'] Phrase Search for: "it is" ['T0.txt', 'T0.txt', 'T2.txt'] The phrase is found most commonly in text: 'T0.txt'
REXX
Note: In this algorithm, word indices start at 1. <lang rexx> /*REXX program illustrates building a simple inverted index & word find.*/
@.= /*dictionary of words (so far).*/ != /*a list of found words (so far).*/
call invertI 0,'BURMA0.TXT' /*read file 0 ... */ call invertI 1,'BURMA1.TXT' /* " " 1 ... */ call invertI 2,'BURMA2.TXT' /* " " 2 ... */ call invertI 3,'BURMA3.TXT' /* " " 3 ... */ call invertI 4,'BURMA4.TXT' /* " " 4 ... */ call invertI 5,'BURMA5.TXT' /* " " 5 ... */ call invertI 6,'BURMA6.TXT' /* " " 6 ... */ call invertI 7,'BURMA7.TXT' /* " " 7 ... */ call invertI 8,'BURMA8.TXT' /* " " 8 ... */ call invertI 9,'BURMA9.TXT' /* " " 9 ... */
call findAword 'does' /*find a word. */ call findAword '60' /*find another word. */ call findAword "don't" /*and find another word. */ call findAword "burma-shave" /*and find yet another word. */ exit /*enough of this, I'm tired. */
/*─────────────────────────────────────FINDAWORD subroutine─────────────*/ findAword: procedure expose @. /*get A word, and uppercase it. */ parse arg ox; arg x /*OX= word; X= uppercase version*/ _=@.x oxo='───'ox"───"
if _== then do
say 'word' oxo "not found." return 0 end
_@=_ /*save _, pass it back to invoker*/ say 'word' oxo "found in:"
do until _== parse var _ f w _ say ' file='f ' word='w end
return _@
/*─────────────────────────────────────INVERTI subroutine───────────────*/
invertI: procedure expose @. !; parse arg #,fn /*file#, filename*/
call lineout fn /*close the file, just in case. */
w=0 /*number of words so far. */
do while lines(fn)\==0 /*read the entire file. */ _=linein(fn) /*read the file, 1 line at a time*/ _=space(_); if _= then iterate /*if blank record, then ignore it*/
say 'file' #",record="_ /*echo a record, just to be verbose.*/
upper _ /*make it case insensative. */
do until _== /*pick off words until done. */ parse var _ xxx _ /*pick off a word. */ xxx=stripper(xxx) /*go and strip off ending punct. */ if xxx= then iterate /*is the word now blank (null) ? */ w=w+1 /*bump the word counter. */ @.xxx=@.xxx # w if wordpos(xxx,!)==0 then !=! xxx /*add to THE list of words found.*/ end
end
call lineout fn /*close the file, just to be neat*/ return w
/*─────────────────────────────────────STRIPPER subroutine──────────────*/
stripper: procedure; parse arg q /*remove punctuation at word-end.*/
@punctuation='.,:;?¿!¡' /*serveral punctuation marks. */
do j=1 for length(@punctuation) q=strip(q,'T',substr(@punctuation,j,1)) end
return q </lang> Output:
file 0,record=Rip a fender file 0,record=off your car file 0,record=send it in file 0,record=for a half-pound jar file 0,record=Burma-shave file 1,record=A peach file 1,record=looks good file 1,record=with lots of fuzz file 1,record=but a man's no peach file 1,record=and never was file 1,record=Burma-shave file 2,record=Does your husband file 2,record=misbehave file 2,record=grunt and grumble file 2,record=rant and rave ? file 2,record=shoot the brute some file 2,record=Burma-shave file 3,record=Don't take a curve file 3,record=at 60 per file 3,record=we hate to lose file 3,record=a customer file 3,record=Burma-shave file 4,record=Every shaver file 4,record=now can snore file 4,record=six more minutes file 4,record=than before file 4,record=by using file 4,record=Burma-shave file 5,record=He played file 5,record=a sax file 5,record=had no B.O. file 5,record=but his whiskers scratched file 5,record=so they let him go file 5,record=Burma-shave file 6,record=Henry the Eighth file 6,record=Prince of Friskers file 6,record=lost five wives file 6,record=but kept his whiskers file 6,record=Burma-shave file 7,record=Listen, birds file 7,record=those signs cost file 7,record=money file 7,record=so roost a while but file 7,record=don't get funny file 7,record=Burma-shave file 8,record=My man file 8,record=won't shave file 8,record=sez Hazel Huz file 8,record=but I should worry file 8,record=Dora's does file 8,record=Burma-shave file 9,record=Past schoolhouses file 9,record=take it slow file 9,record=let the little file 9,record=shavers file 9,record=grow file 9,record=Burma-shave word ---does--- found in: file=2 word=1 file=8 word=13 word ---60--- found in: file=3 word=6 word ---don't--- found in: file=3 word=1 file=7 word=12 word ---burma-shave--- found in: file=0 word=14 file=1 word=17 file=2 word=15 file=3 word=14 file=4 word=13 file=5 word=17 file=6 word=14 file=7 word=15 file=8 word=14 file=9 word=11
Ruby
I broke this into two parts, storing the index as a file on disk to better represent how this might actually be used in practice. The indexmerge part will create or update the index data file with any files given on the command line, and then indexsearch will use the data file to search for any terms listed on the command line. The example is based on http://en.wikipedia.org/wiki/Inverted_index of 2010/09/10.
indexmerge.rb <lang ruby>if File.exist? "index.dat"
@data = Marshal.load open("index.dat")
else
@data = {}
end
- Let's give the string class the ability to tokenize itsself into lowercase
- words with no punctuation.
class String
def index_sanitize self.split.collect do |token| token.downcase.gsub(/\W/, ) end end
end
- Just implementing a simple inverted index here.
ARGV.each do |filename|
open filename do |file| file.read.index_sanitize.each do |word| @data[word] ||= [] @data[word] << filename unless @data[word].include? filename end end
end
open("index.dat", "w") do |index|
index.write Marshal.dump(@data)
end</lang>
indexsearch.rb <lang ruby>if File.exist? "index.dat"
@data = Marshal.load open("index.dat")
else
raise "The index data file could not be located."
end
class String
def index_sanitize self.split.collect do |token| token.downcase.gsub(/\W/, ) end end
end
- Take anything passed in on the command line in any form and break it
- down the same way we did when making the index.
ARGV.join(' ').index_sanitize.each do |word|
@result ||= @data[word] @result &= @data[word]
end
p @result</lang>
Output
> ./indexmerge.rb file1 > ./indexmerge.rb file2 file3 > ./indexsearch.rb what is it ["file1", "file2"] > ./indexsearch.rb "a banana" ["file3"] > ./indexsearch.rb It iS\! ["file1", "file2", "file3"]
Tcl
<lang tcl>package require Tcl 8.5 proc wordsInString str {
# We define "words" to be "maximal sequences of 'word' characters". # The other possible definition is to use 'non-space' characters. regexp -all -inline {\w+} $str
}
- Adds a document to the index. The index is a map from words to a map
- from filenames to lists of word locations.
proc addDocumentToIndex {filename} {
global index set f [open $filename] set data [read $f] close $f
set i 0 array set localidx {} foreach word [wordsInString $data] {
lappend localidx($word) $i incr i
}
# Transcribe into global index foreach {word places} [array get localidx] {
dict set index($word) $filename $places
}
}
- How to use the index to find files containing a word
proc findFilesForWord {word} {
global index if {[info exists index($word)]} {
return [dict keys $index($word)]
}
}
- How to use the index to find files containing all words from a list.
- Note that this does not use the locations within the file.
proc findFilesWithAllWords {words} {
set files [findFilesForWord [lindex $words 0]] foreach w [lrange $words 1 end] {
set wf [findFilesForWord $w] set newfiles {} foreach f $files { if {$f in $wf} {lappend newfiles $f} } set files $newfiles
} return $files
}
- How to use the index to find a sequence of words in a file.
proc findFilesWithWordSequence {words} {
global index set files {} foreach w $words {
if {![info exist index($w)]} { return }
} dict for {file places} $index([lindex $words 0]) {
if {$file in $files} continue foreach start $places { set gotStart 1 foreach w [lrange $words 1 end] { incr start set gotNext 0 foreach {f ps} $index($w) { if {$f ne $file} continue foreach p $ps { if {$p == $start} { set gotNext 1 break } } if {$gotNext} break } if {!$gotNext} { set gotStart 0 break } } if {$gotStart} { lappend files $file break } }
} return $files
}</lang> For the GUI: <lang tcl>package require Tk pack [labelframe .files -text Files] -side left -fill y pack [listbox .files.list -listvariable files] pack [button .files.add -command AddFile -text "Add File to Index"] pack [labelframe .found -text Found] -side right -fill y pack [listbox .found.list -listvariable found] -fill x pack [entry .found.entry -textvariable terms] -fill x pack [button .found.findAll -command FindAll \ -text "Find File with All"] -side left pack [button .found.findSeq -command FindSeq \ -text "Find File with Sequence"] -side right
- The actions invoked by various GUI buttons
proc AddFile {} {
global files set f [tk_getOpenFile] if {$f ne ""} {
addDocumentToIndex $f lappend files $f
}
} proc FindAll {} {
global found terms set words [wordsInString $terms] set fs [findFilesWithAllWords $words] lappend found "Searching for files with all $terms" {*}$fs \
"---------------------" } proc FindSeq {} {
global found terms set words [wordsInString $terms] set fs [findFilesWithWordSequence $words] lappend found "Searching for files with \"$terms\"" {*}$fs \
"---------------------" }</lang>
TUSCRIPT
<lang tuscript> $$ MODE TUSCRIPT
files="file1'file2'file3" LOOP file=files ERROR/STOP CREATE (file,seq-o,-std-) ENDLOOP
content1="it is what it is" content2="what is it" content3="it is a banana"
FILE/ERASE "file1" = content1 FILE/ERASE "file2" = content2 FILE/ERASE "file3" = content3
ASK "search for": search="" IF (search=="") STOP
BUILD R_TABLE/USER/AND search = * DATA {search}
LOOP/CLEAR file=files
ACCESS q: READ/RECORDS $file s.z/u,content,count LOOP COUNT/NEXT/EXIT q (-; search;-;-) IF (count!=0) files=APPEND (files," ",file) ENDLOOP ENDACCESs q
ENDLOOP PRINT "-> ",files </lang> Output:
search for >what is it -> file1 file2 search for >banana -> file3 search for >it is -> file1 file2 file3
UNIX Shell
Associative array
<lang bash>#!/bin/ksh
typeset -A INDEX
function index {
typeset num=0 for file in "$@"; do tr -s '[:punct:]' ' ' < "$file" | while read line; do for token in $line; do INDEX[$token][$num]=$file done done ((++num)) done
}
function search {
for token in "$@"; do for file in "${INDEX[$token][@]}"; do echo "$file" done done | sort | uniq -c | while read count file; do (( count == $# )) && echo $file done
}</lang>
Example use: <lang korn>index *.txt search hello world </lang>
Directory on filesystem
The following is an attempt (not yet complete) to port the above script to pdksh, and perhaps other Bourne-compatible shells.
- TODO Fill in "search.sh".
- Add note about slowness.
<lang bash>#!/bin/sh
- index.sh - create an inverted index
unset IFS
- ${INDEX:=index}
- Prohibit '\n' in filenames (because '\n' is
- the record separator for $INDEX/all.tab).
for file in "$@"; do # Use printf(1), not echo, because "$file" might start with # a hyphen and become an option to echo. test 0 -eq $(printf %s "$file" | wc -l) || { printf '%s\n' "$file: newline in filename" >&2 exit 1 } done
- Make a new directory for the index, or else
- exit with the error message from mkdir(1).
mkdir "$INDEX" || exit $?
fi=1 for file in "$@"; do printf %s "Indexing $file." >&2
# all.tab maps $fi => $file echo "$fi $file" >> "$INDEX/all.tab"
# Use punctuation ([:punct:]) and whitespace (IFS) # to split tokens. ti=1 tr -s '[:punct:]' ' ' < "$file" | while read line; do for token in $line; do # Index token by position ($fi, $ti). Ignore # error from mkdir(1) if directory exists. mkdir "$INDEX/$token" 2>/dev/null echo $ti >> "$INDEX/$token/$fi" : $((ti += 1))
# Show progress. Print a dot per 1000 tokens. case "$ti" in *000) printf . esac done done
echo >&2 : $((fi += 1)) done</lang>
<lang bash>#!/bin/sh
- search.sh - search an inverted index
unset IFS
- ${INDEX:=index}
want=sequence while getopts aos name; do case "$name" in a) want=all;; o) want=one;; s) want=sequence;; *) exit 2;; esac done shift $((OPTIND - 1))
all() { echo "TODO" exit 2 }
one() { echo "TODO" exit 2 }
sequence() { echo "TODO" exit 2 }
$want "$@"</lang>