Text completion: Difference between revisions

Content added Content deleted

Inline

Revision as of 00:19, 30 July 2020

Task

Write a program that takes in a user inputted word and prints out possible words that are valid in the English dictionary. Please state any dictionaries or files/binaries/dependencies used in your program. Do show the similarity of the inputted word and outcome as a percentage. Any algorithm can be used to accomplish this task.

Resources

Github Repo
Raw Text, Save as .txt file
Hamming Distance
Jaro-Winkler Distance
SoundEx Algorithm
SoundEx Algorithm Wiki
Dice's Coefficient
Dice Coefficient Wiki

Possible Output

Input word: 
complition

compaction : 80.00% similar.
completion : 90.00% similar.
completions : 81.82% similar.
complexion : 80.00% similar.

Extension

How can you make the accuracy of your program higher?

Delphi

Library: System.SysUtils

Library: System.Classes

Library: System.Math

Library: System.Generics.Collections

Translation of: Wren

Using unixdict.txt. <lang Delphi> program Text_Completion;

{$APPTYPE CONSOLE}

{$R *.res}

uses

 System.SysUtils,
 System.Classes,
 System.Math,
 System.Generics.Collections;

type

 TSujestions = TDictionary<Integer, string>;

function Levenshtein(s, t: string): integer; var

 d: array of array of integer;
 i, j, cost: integer;

begin

 SetLength(d, Length(s) + 1);
 for i := Low(d) to High(d) do
 begin
   SetLength(d[i], Length(t) + 1);
 end;

 for i := Low(d) to High(d) do
 begin
   d[i, 0] := i;
   for j := Low(d[i]) to High(d[i]) do
   begin
     d[0, j] := j;
   end;
 end;

 for i := Low(d) + 1 to High(d) do
 begin
   for j := Low(d[i]) + 1 to High(d[i]) do
   begin
     if s[i] = t[j] then
     begin
       cost := 0;
     end
     else
     begin
       cost := 1;
     end;

     d[i, j] := Min(Min(d[i - 1, j] + 1,      //deletion
       d[i, j - 1] + 1),     //insertion
       d[i - 1, j - 1] + cost  //substitution
     );
   end;
 end;
 Result := d[Length(s), Length(t)];

end;

function FindSujestions(Search: string; dict: TStringList): TSujestions; var

 I, ld: Integer;
 w: string;

begin

 Result := TSujestions.Create;
 for I := 0 to 3 do
   Result.Add(I, );

 for I := 0 to dict.Count - 1 do
 begin
   w := dict[I];
   ld := Levenshtein(Search, w);
   if ld < 4 then
     Result[ld] := Result[ld] + ' ' + w;
 end;

end;

function Similarity(Search: string; Distance: Integer): Double; var

 ALength: Double;

begin

 ALength := Search.Length;
 Result := (ALength - Distance) * 100 / ALength;

end;

var

 dict: TStringList;
 Search: string;
 i: Integer;
 Sujestions: TSujestions;

begin

 dict := TStringList.Create;
 dict.LoadFromFile('unixdict.txt');
 Search := 'complition';

 Sujestions := FindSujestions(Search, dict);

 Writeln('Input word: '#10, Search);

 for i := 1 to 3 do
 begin
   Writeln(Format('Words which are %4.1f%% similar:', [Similarity(Search, i)]));
   Writeln(sujestions[i], #10);
 end;

 Sujestions.Free;
 dict.Free;

 Readln;

end.

</lang>

Output:

Input word:
complition
Words which are 90,0% similar:
 completion

Words which are 80,0% similar:
 coalition competition compilation complexion composition

Words which are 70,0% similar:
 cognition collision combustion commotion companion compassion complain complicity compton compulsion compunction computation condition contrition demolition incompletion volition

Go

Translation of: Wren

<lang go>package main

import (

   "bytes"
   "fmt"
   "io/ioutil"
   "log"

)

func levenshtein(s, t string) int {

   d := make([][]int, len(s)+1)
   for i := range d {
       d[i] = make([]int, len(t)+1)
   }
   for i := range d {
       d[i][0] = i
   }
   for j := range d[0] {
       d[0][j] = j
   }
   for j := 1; j <= len(t); j++ {
       for i := 1; i <= len(s); i++ {
           if s[i-1] == t[j-1] {
               d[i][j] = d[i-1][j-1]
           } else {
               min := d[i-1][j]
               if d[i][j-1] < min {
                   min = d[i][j-1]
               }
               if d[i-1][j-1] < min {
                   min = d[i-1][j-1]
               }
               d[i][j] = min + 1
           }
       }

   }
   return d[len(s)][len(t)]

}

func main() {

   search := "complition"
   b, err := ioutil.ReadFile("unixdict.txt")
   if err != nil {
       log.Fatal("Error reading file")
   }
   words := bytes.Fields(b)
   var lev [4][]string
   for _, word := range words {
       s := string(word)
       ld := levenshtein(search, s)
       if ld < 4 {
           lev[ld] = append(lev[ld], s)
       }
   }
   fmt.Printf("Input word: %s\n\n", search)
   for i := 1; i < 4; i++ {
       length := float64(len(search))
       similarity := (length - float64(i)) * 100 / length
       fmt.Printf("Words which are %4.1f%% similar:\n", similarity)
       fmt.Println(lev[i])
       fmt.Println()
   }

}</lang>

Output:

Input word: complition

Words which are 90.0% similar:
[completion]

Words which are 80.0% similar:
[coalition competition compilation complexion composition]

Words which are 70.0% similar:
[cognition collision combustion commotion companion compassion complain complicity compton compulsion compunction computation condition contrition demolition incompletion volition]

Java

Github Repo Uses dependencies given. <lang Java> import java.io.File; import java.io.IOException; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Scanner;

//uses https://github.com/dwyl/english-words

public class textCompletionConcept {

   public static int correct = 0;
   public static ArrayList<String> listed = new ArrayList<>();
   public static void main(String[]args) throws IOException, URISyntaxException {
       Scanner input = new Scanner(System.in);
       System.out.println("Input word: ");
       String errorRode = input.next();
       File file = new File(new 
       File(textCompletionConcept.class.getProtectionDomain().getCodeSource().getLocation().toURI()).getPath() + File.separator + "words.txt");
       Scanner reader = new Scanner(file);
       while(reader.hasNext()){
           double percent;
           String compareToThis = reader.nextLine();
                   char[] s1 = errorRode.toCharArray();
                   char[] s2 = compareToThis.toCharArray();
                   int maxlen = Math.min(s1.length, s2.length);
                   for (int index = 0; index < maxlen; index++) {
                       String x = String.valueOf(s1[index]);
                       String y = String.valueOf(s2[index]);
                       if (x.equals(y)) {
                           correct++;
                       }
                   }
                   double length = Math.max(s1.length, s2.length);
                   percent = correct / length;
                   percent *= 100;
                   boolean perfect = false;
                   if (percent >= 80 && compareToThis.charAt(0) == errorRode.charAt(0)) {
                       if(String.valueOf(percent).equals("100.00")){
                           perfect = true;
                       }
                       String addtoit = compareToThis + " : " + String.format("%.2f", percent) + "% similar.";
                       listed.add(addtoit);
                   }
                   if(compareToThis.contains(errorRode) && !perfect && errorRode.length() * 2 > compareToThis.length()){
                       String addtoit = compareToThis + " : 80.00% similar.";
                       listed.add(addtoit);
                   }
           correct = 0;
       }

       for(String x : listed){
           if(x.contains("100.00% similar.")){
               System.out.println(x);
               listed.clear();
               break;
           }
       }

       for(String x : listed){
           System.out.println(x);
       }
   }

} </lang>

Output

Input word: 
complition
compaction : 80.00% similar.
completion : 90.00% similar.
completions : 81.82% similar.
complexion : 80.00% similar.

Process finished with exit code 0

Julia

See https://en.wikipedia.org/wiki/Levenshtein_distance, the number of one character edits to obtain one word from another. <lang julia>using StringDistances

const fname = download("https://www.mit.edu/~ecprice/wordlist.10000", "wordlist10000.txt") const words = read(fname, String) |> split .|> strip .|> string const wrd = "complition"

levdistof(n, string) = filter(w -> Levenshtein()(string, w) == n, words)

for n in 1:4

   println("Words at Levenshtein distance of $n (",
       100 - Int(round(100 * n / length(wrd))), "% similarity) from \"$wrd\": \n",
       levdistof(n, wrd), "\n")

end

</lang>

Output:

Words at Levenshtein distance of 1 (90% similarity) from "complition":
["completion"]

Words at Levenshtein distance of 2 (80% similarity) from "complition":
["coalition", "competition", "compilation", "composition"]

Words at Levenshtein distance of 3 (70% similarity) from "complition":
["companion", "competitions", "completing", "complications", "computation", "condition"]

Words at Levenshtein distance of 4 (60% similarity) from "complition":
["collection", "combination", "commission", "comparison", "compensation", "competing", "competitive", "complaint", "complete", "completed", "completely", "complexity", "compliance", "compliant", "compression", "computing", "conclusion", "conditions", "connection", "convention", "conviction", "cooperation", "corporation", "correction", "correlation", "corruption", "nomination", "opinion", "opposition", "option", "pollution", "population", "position", "simulation", "solution"]

Phix

Translation of: Julia

uses levenshtein() from Levenshtein_distance#Phix and unixdict (not the same as the one Julia uses - I did just check and indeed it does not have "collection" in it!) <lang Phix>string word = "complition" sequence words = get_text(join_path({"demo","unixdict.txt"}),GT_LF_STRIPPED) function lt(string w, integer n) return levenshtein(w,word)=n end function for n=1 to 4 do

   printf(1,"Words at Levenshtein distance of %d (%g%% similarity) from \"%s\": \n%s\n",
            {n,100-round(100*n/length(word)),word,join_by(filter(words,lt,n),1,6)})

end for</lang> Note the parameters of the filter routine lt() can be quite flexible, for instance you could instead do this (and get the same results) <lang Phix>function lt(string w, sequence nw)

   {integer n, string word} = nw

...

            {n,100-round(100*n/length(word)),word,join_by(filter(words,lt,{n,word}),1,6)})</lang>

Output:

Words at Levenshtein distance of 1 (90% similarity) from "complition":
completion

Words at Levenshtein distance of 2 (80% similarity) from "complition":
coalition   competition   compilation   complexion   composition

Words at Levenshtein distance of 3 (70% similarity) from "complition":
cognition   collision   combustion   commotion   companion   compassion
complain   complicity   compton   compulsion   compunction   computation
condition   contrition   demolition   incompletion   volition

Words at Levenshtein distance of 4 (60% similarity) from "complition":
abolition   admonition   ambition   ammunition   campion   caption
collusion   combination   commission   communion   compactify   comparison
compatriot   competitive   competitor   complaint   complete   compliant
complicate   compliment   compline   compositor   compression   conception
concision   conclusion   concretion   congestion   consolation   contention
convention   convolution   corruption   cotillion   decomposition   depletion
dominion   gumption   implosion   imposition   munition   oblivion
opinion   opposition   option   pollution   position   simpleton
soliton   solution   sorption   templeton   temptation   tomlinson

Raku

(formerly Perl 6)

Translation of: Java

<lang perl6>sub MAIN ( Str $user_word = 'complition', Str $filename = 'words.txt' ) {

   my @s1 = $user_word.comb;
   my @listed = gather for $filename.IO.lines -> $line {
       my @s2 = $line.comb;

       my $correct = 100 * sum( @s1 Zeq @s2)
                         / max(+@s1,   +@s2);

       my $score = ( $correct >= 100            and @s1[0] eq @s2[0] ) ?? 100
                !! ( $correct >=  80            and @s1[0] eq @s2[0] ) ?? $correct
                !! ( $line.contains($user_word) and @s1 * 2 > @s2    ) ?? 80
                !!                                                        0;
       take [$score, $line] if $score;
   }

   @listed = @listed[$_] with @listed.first: :k, { .[0] == 100 };

   say "{.[0].fmt('%.2f')}% {.[1]}" for @listed;

}</lang>

Output:

80.00% compaction
90.00% completion
81.82% completions
80.00% complexion

REXX

The method used to find a word (that is contained in a specified dictionary) closest to a specified string of letters is:

Perform any of (three methods of actions):

delete any letter
insert any letter

(any letter is inserted at any location)
(this includes prefixing any letter)
(this includes suffixing any letter)

substitute any letter at any location

(Only lowercase letters were used for the [above] three methods).

Perform any of the above along with any combination of any method.

This will, in effect, delete/insert/substitute any two letters:

delete + delete
delete + insert
delete + substitute
insert + delete
insert + insert
insert + substitute
substitute + delete
substitute + insert
substitute + substitute

No attempt was made to change (by any method) three letters or more. <lang rexx>/*REXX pgm finds (dictionary) words which can be found in a specified word wheel (grid).*/ parse arg what iFID . /*obtain optional arguments from the CL*/ if what==|what=="," then what= 'complition' /*Not specified? Then use the default.*/ if iFID==|iFID=="," then iFID= 'UNIXDICT.TXT' /* " " " " " " */ @abc= 'abcdefghijklmnopqrstuvwxyz' /*(Latin) lowercase letters to be used.*/ L= length(@abc) /* " " " the Latin letters. */ wrds= 0 /*# words that are in the dictionary. */ dups= 0 /*" " " " duplicates. */ ills= 0 /*" " " contain "not" letters.*/ say ' Reading the file: ' iFID /*align the text. */ @.= . /*non─duplicated dictionary words. */ $= /*the list of dictionary words in grid.*/

    do recs=0  while lines(iFID)\==0            /*process all words in the dictionary. */
    x= space( linein(iFID), 0)                  /*elide any blanks in the dictinary.   */
    if @.x\==.           then do; dups= dups+1; iterate; end  /*is this a duplicate?   */
    if \datatype(x,'M')  then do; ills= ills+1; iterate; end  /*has word non─letters?  */
    @.x=                                        /*signify that  X  is a dictionary word*/
    wrds= wrds + 1                              /*bump the number of "good" dist. words*/
    end   /*recs*/

a= say ' number of records (words) in the dictionary: ' right( commas(recs), 9) say ' number of ill─formed words in the dictionary: ' right( commas(ills), 9) say ' number of duplicate words in the dictionary: ' right( commas(dups), 9) say ' number of acceptable words in the dictionary: ' right( commas(wrds), 9) say ' the "word" to be used for text completion: ' what say call del what; a= a result; call del what,1; a= a result call ins what; a= a result; call ins what,1; a= a result call sub what; a= a result; call sub what,1; a= a result call prune

= words($)

say commas(#) ' similar words found:'

      do j=1  for #;  _= word($, j);    say right( count(_,what), 24)  _
      end   /*j*/

exit # /*stick a fork in it, we're all done. */ /*──────────────────────────────────────────────────────────────────────────────────────*/ commas: parse arg _; do ?=length(_)-3 to 1 by -3; _= insert(',', _, ?); end; return _ prune: do k=1 for words(a); _= word(a,k); if wordpos(_,$)==0 then $= $ _; end; return recur: $= $ del(z); $= $ ins(z); $= $ sub(z); return /*──────────────────────────────────────────────────────────────────────────────────────*/ count: procedure; parse arg x,y; cnt= 0; w= length(x)

        do j=1  for w;           p= pos( substr(x, j, 1), y);       if p==0  then iterate
        y= overlay(., y, p);     cnt= cnt + 1
        end   /*j*/
      return '     ' left("("format(cnt/w*100,,2)/1'%)', 9)     /*express as a percent.*/

/*──────────────────────────────────────────────────────────────────────────────────────*/ del: procedure expose @. @abc L; parse arg y,r; $=

        do j=1  for length(y);       z= space(left(y,j-1) ||           substr(y,j+1), 0)
        if @.z\==.  then $= $ z;     if r==1  then call recur
        end     /*j*/;               return space($)

/*──────────────────────────────────────────────────────────────────────────────────────*/ ins: procedure expose @. @abc L; parse arg y,r; $=

        do j=1  for length(y)
          do k=1  for L;   z= space(left(y,j-1) || substr(@abc,k,1) || substr(y,j),   0)
          if @.z\==.  then $= $ z;   if r==1  then call recur
          end   /*k*/
        end     /*j*/;               return space($)

/*──────────────────────────────────────────────────────────────────────────────────────*/ sub: procedure expose @. @abc L; parse arg y,r; $=

        do j=1  for length(y)
          do k=1  for L;   z= space(left(y,j-1) || substr(@abc,k,1) || substr(y,j+1), 0)
          if @.z\==.  then $= $ z;   if r==1  then call recur
          end   /*k*/
        end     /*j*/;               return space($)</lang>

output when using the default inputs:

                                Reading the file:  UNIXDICT.TXT
    number of  records (words) in the dictionary:     25,104
    number of ill─formed words in the dictionary:        126
    number of  duplicate words in the dictionary:          0
    number of acceptable words in the dictionary:     24,978
    the  "word"  to be used for  text completion:  complition

6  similar words found:
               (88.89%)  coalition
               (90%)     completion
               (81.82%)  competition
               (90.91%)  compilation
               (81.82%)  composition
               (80%)     complexion

The input file is the same dictionary that the Java entry used.

output when using the inputs of: , GitHub.dict

                                Reading the file:  GitHub.dict
    number of  records (words) in the dictionary:    466,551
    number of ill─formed words in the dictionary:     50,254
    number of  duplicate words in the dictionary:          0
    number of acceptable words in the dictionary:    416,297
    the  "word"  to be used for  text completion:  complition

11  similar words found:
               (88.89%)  coalition
               (90%)     completion
               (81.82%)  commolition
               (81.82%)  comparition
               (81.82%)  competition
               (90.91%)  compilation
               (81.82%)  composition
               (81.82%)  complection
               (83.33%)  complication
               (80%)     compaction
               (80%)     complexion

output when using the inputs of: , my.dict

This is my personal dictionary.

                                Reading the file:  my.dict
    number of  records (words) in the dictionary:    947,364
    number of ill─formed words in the dictionary:    192,683
    number of  duplicate words in the dictionary:          6
    number of acceptable words in the dictionary:    754,675
    the  "word"  to be used for  text completion:  complition

12  similar words found:
               (88.89%)  coalition
               (90%)     completion
               (81.82%)  commolition
               (81.82%)  comparition
               (81.82%)  competition
               (90.91%)  compilation
               (90.91%)  compiletion
               (81.82%)  composition
               (81.82%)  complection
               (83.33%)  complication
               (80%)     compaction
               (80%)     complexion

Wren

Library: Wren-fmt

This uses 'unixdict' and the Levenshtein distance algorithm to test for similarity. <lang ecmascript>import "io" for File import "/fmt" for Fmt

var levenshtein = Fn.new { |s, t|

   var ls = s.count
   var lt = t.count
   var d = List.filled(ls + 1, null)
   for (i in 0..ls) {
       d[i] = List.filled(lt + 1, 0)
       d[i][0] = i
   }
   for (j in 0..lt) d[0][j] = j
   for (j in 1..lt) {
       for (i in 1..ls) {
           if (s[i-1] == t[j-1]) {
               d[i][j] = d[i-1][j-1]
           } else {
               var min = d[i-1][j]
               if (d[i][j-1] < min) min = d[i][j-1]
               if (d[i-1][j-1] < min) min = d[i-1][j-1]
               d[i][j] = min + 1
           }
       }
   }
   return d[-1][-1]

}

var search = "complition" var words = File.read("unixdict.txt").split("\n").map { |w| w.trim() }.where { |w| w != "" } var lev = [[], [], [], []] var ld for (word in words) {

   if ((ld = levenshtein.call(search, word)) < 4) {
       lev[ld].add(word)
   }

}

System.print("Input word: %(search)\n") for (i in 1..3) {

   var count = search.count
   var similarity = (count - i) * 100 / count
   Fmt.print("Words which are $4.1f\% similar:", similarity)
   System.print(lev[i])
   System.print()

} </lang>

Output:

Words which are 90.0% similar:
[completion]

Words which are 80.0% similar:
[coalition, competition, compilation, complexion, composition]

Words which are 70.0% similar:
[cognition, collision, combustion, commotion, companion, compassion, complain, complicity, compton, compulsion, compunction, computation, condition, contrition, demolition, incompletion, volition]