Levenshtein distance/Alignment

From Rosetta Code
Revision as of 15:11, 18 October 2016 by rosettacode>Mrmartin (Mathematica code calculates distance, but does not give alignment)
Levenshtein distance/Alignment is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.

The Levenshtein distance algorithm returns the number of atomic operations (insertion, deletion or edition) that must be performed on a string in order to obtain an other one, but it does not say anything about the actual operations used or their order.

An alignment is a notation used to describe the operations used to turn a string into an other. At some point in the strings, the minus character ('-') is placed in order to signify that a character must be added at this very place. For instance, an alignment between the words 'place' and 'palace' is:

P-LACE
PALACE


Task

Write a function that shows the alignment of two strings for the corresponding levenshtein distance.

As an example, use the words "rosettacode" and "raisethysword".

You can either implement an algorithm, or use a dedicated library (thus showing us how it is named in your language).

C

<lang c>#include <stdio.h>

  1. include <stdlib.h>
  2. include <string.h>

typedef struct edit_s edit_t, *edit; struct edit_s { char c1, c2; int n; edit next; };

void leven(char *a, char *b) { int i, j, la = strlen(a), lb = strlen(b); edit *tbl = malloc(sizeof(edit) * (1 + la)); tbl[0] = calloc((1 + la) * (1 + lb), sizeof(edit_t)); for (i = 1; i <= la; i++) tbl[i] = tbl[i-1] + (1+lb);

for (i = la; i >= 0; i--) { char *aa = a + i; for (j = lb; j >= 0; j--) { char *bb = b + j; if (!*aa && !*bb) continue;

edit e = &tbl[i][j]; edit repl = &tbl[i+1][j+1]; edit dela = &tbl[i+1][j]; edit delb = &tbl[i][j+1];

e->c1 = *aa; e->c2 = *bb; if (!*aa) { e->next = delb; e->n = e->next->n + 1; continue; } if (!*bb) { e->next = dela; e->n = e->next->n + 1; continue; }

e->next = repl; if (*aa == *bb) { e->n = e->next->n; continue; }

if (e->next->n > delb->n) { e->next = delb; e->c1 = 0; } if (e->next->n > dela->n) { e->next = dela; e->c1 = *aa; e->c2 = 0; } e->n = e->next->n + 1; } }

edit p = tbl[0]; printf("%s -> %s: %d edits\n", a, b, p->n);

while (p->next) { if (p->c1 == p->c2) printf("%c", p->c1); else { putchar('('); if (p->c1) putchar(p->c1); putchar(','); if (p->c2) putchar(p->c2); putchar(')'); }

p = p->next; } putchar('\n');

free(tbl[0]); free(tbl); }

int main(void) { leven("raisethysword", "rosettacode"); return 0; }</lang>

Output:
raisethysword -> rosettacode: 8 edits
r(a,o)(i,)set(h,t)(y,a)(s,c)(w,)o(r,d)(d,e)

D

Using the standard library. <lang d>void main() {

   import std.stdio, std.algorithm;
   immutable s1 = "rosettacode";
   immutable s2 = "raisethysword";
   string s1b, s2b;
   size_t pos1, pos2;
   foreach (immutable c; levenshteinDistanceAndPath(s1, s2)[1]) {
       final switch (c) with (EditOp) {
           case none, substitute:
               s1b ~= s1[pos1++];
               s2b ~= s2[pos2++];
               break;
           case insert:
               s1b ~= "_";
               s2b ~= s2[pos2++];
               break;
           case remove:
               s1b ~= s1[pos1++];
               s2b ~= "_";
               break;
       }
   }
   writeln(s1b, "\n", s2b);

}</lang>

Output:
r_oset_tacode
raisethysword

Go

Library: biogo

Alignment computed by the Needleman-Wunch algorithm, which is used in bioinformatics. <lang go>package main

import (

   "fmt"
   "github.com/biogo/biogo/align"
   ab "github.com/biogo/biogo/alphabet"
   "github.com/biogo/biogo/feat"
   "github.com/biogo/biogo/seq/linear"

)

func main() {

   // Alphabets for things like DNA are predefined in biogo, but we
   // define our own here.
   lc := ab.Must(ab.NewAlphabet("-abcdefghijklmnopqrstuvwxyz",
       feat.Undefined, '-', 0, true))
   // Construct scoring matrix for Needleman-Wunch algorithm.
   // We leave zeros on the diagonal for the Levenshtein distance of an
   // exact match and put -1s everywhere else for the Levenshtein distance
   // of an edit.
   nw := make(align.NW, lc.Len())
   for i := range nw {
       r := make([]int, lc.Len())
       nw[i] = r
       for j := range r {
           if j != i {
               r[j] = -1
           }
       }
   }
   // define input sequences
   a := &linear.Seq{Seq: ab.BytesToLetters([]byte("rosettacode"))}
   a.Alpha = lc
   b := &linear.Seq{Seq: ab.BytesToLetters([]byte("raisethysword"))}
   b.Alpha = lc
   // perform alignment
   aln, err := nw.Align(a, b)
   // format and display result
   if err != nil {
       fmt.Println(err)
       return
   }
   fa := align.Format(a, b, aln, '-')
   fmt.Printf("%s\n%s\n", fa[0], fa[1])
   aa := fmt.Sprint(fa[0])
   ba := fmt.Sprint(fa[1])
   ma := make([]byte, len(aa))
   for i := range ma {
       if aa[i] == ba[i] {
           ma[i] = ' '
       } else {
           ma[i] = '|'
       }
   }
   fmt.Println(string(ma))

}</lang>

Output:

The lines after the alignment point out the 8 edits.

r-oset-tacode
raisethysword
 ||   |||| ||

Java

<lang java>public class LevenshteinAlignment {

   public static String[] alignment(String a, String b) {
       a = a.toLowerCase();
       b = b.toLowerCase();
       // i == 0
       int[][] costs = new int[a.length()+1][b.length()+1];
       for (int j = 0; j <= b.length(); j++)
           costs[0][j] = j;
       for (int i = 1; i <= a.length(); i++) {
           costs[i][0] = i;
           for (int j = 1; j <= b.length(); j++) {
               costs[i][j] = Math.min(1 + Math.min(costs[i-1][j], costs[i][j-1]), a.charAt(i - 1) == b.charAt(j - 1) ? costs[i-1][j-1] : costs[i-1][j-1] + 1);
           }
       }

// walk back through matrix to figure out path StringBuilder aPathRev = new StringBuilder(); StringBuilder bPathRev = new StringBuilder(); for (int i = a.length(), j = b.length(); i != 0 && j != 0; ) { if (costs[i][j] == (a.charAt(i - 1) == b.charAt(j - 1) ? costs[i-1][j-1] : costs[i-1][j-1] + 1)) { aPathRev.append(a.charAt(--i)); bPathRev.append(b.charAt(--j)); } else if (costs[i][j] == 1 + costs[i-1][j]) { aPathRev.append(a.charAt(--i)); bPathRev.append('-'); } else if (costs[i][j] == 1 + costs[i][j-1]) { aPathRev.append('-'); bPathRev.append(b.charAt(--j)); } }

       return new String[]{aPathRev.reverse().toString(), bPathRev.reverse().toString()};
   }
   public static void main(String[] args) {

String[] result = alignment("rosettacode", "raisethysword"); System.out.println(result[0]); System.out.println(result[1]);

   }

}</lang>

Output:
r-oset-tacode
raisethysword

Mathematica / Wolfram Language

This example is incorrect. It does not accomplish the given task. Please fix the code and remove this message.

<lang Mathematica>DamerauLevenshteinDistance["rosettacode", "raisethysword"]</lang>

Output:
8

Perl

<lang perl>use strict; use warnings;

use List::Util qw(min);

sub levenshtein_distance_alignment {

   my @s = ('^', split //, shift);
   my @t = ('^', split //, shift);

   my @A;
   @{$A[$_][0]}{qw(d s t)} = ($_, join(, @s[1 .. $_]), ('~' x $_)) for 0 .. $#s;
   @{$A[0][$_]}{qw(d s t)} = ($_, ('-' x $_), join , @t[1 .. $_])  for 0 .. $#t;
   for my $i (1 .. $#s) {
       for my $j (1 .. $#t) {

if ($s[$i] ne $t[$j]) { $A[$i][$j]{d} = 1 + ( my $min = min $A[$i-1][$j]{d}, $A[$i][$j-1]{d}, $A[$i-1][$j-1]{d} ); @{$A[$i][$j]}{qw(s t)} = $A[$i-1][$j]{d} == $min ? ($A[$i-1][$j]{s}.$s[$i], $A[$i-1][$j]{t}.'-') : $A[$i][$j-1]{d} == $min ? ($A[$i][$j-1]{s}.'-', $A[$i][$j-1]{t}.$t[$j]) : ($A[$i-1][$j-1]{s}.$s[$i], $A[$i-1][$j-1]{t}.$t[$j]); }

           else {

@{$A[$i][$j]}{qw(d s t)} = ( $A[$i-1][$j-1]{d}, $A[$i-1][$j-1]{s}.$s[$i], $A[$i-1][$j-1]{t}.$t[$j] );

           }
       }
   }
   return @{$A[-1][-1]}{'s', 't'};

}

print join "\n", levenshtein_distance_alignment "rosettacode", "raisethysword";</lang>

Output:
ro-settac-o-de
raisethysword-

Perl 6

This example is incorrect. It does not accomplish the given task. Please fix the code and remove this message.
Translation of: Perl

<lang Perl 6>sub align ( Str $σ, Str $t ) {

   my @s = *, $σ.comb;
   my @t = *, $t.comb;
    
   my @A;
   @A[$_][ 0]<d s t> = $_, @s[1..$_].join, '-' x $_ for ^@s;
   @A[ 0][$_]<d s t> = $_, '-' x $_, @t[1..$_].join for ^@t;
    
   for 1 ..^ @s X 1..^ @t -> (\i, \j) {

if @s[i] ne @t[j] { @A[i][j]<d> = 1 + my $min = min @A[i-1][j]<d>, @A[i][j-1]<d>, @A[i-1][j-1]<d>; @A[i][j] = @A[i-1][j]<d> == $min ?? (@A[i-1][j] Z~ @s[i], '-') !! @A[i][j-1]<d> == $min ?? (@A[i][j-1] Z~ '-', @t[j]) !! (@A[i-1][j-1] Z~ @s[i], @t[j]); } else { @A[i][j]<d s t> = @A[i-1][j-1]<d s t> Z~ , @s[i], @t[j]; }

   }
    
   return @A[*-1][*-1];

}

.say for align |<rosettacode raisethysword>;</lang>

Output:
ro-settac-o-de
raisethysword-

Racket

Simple version (no aligment)

First we will analyze this solution that only computes the distance. See http://blog.racket-lang.org/2012/08/dynamic-programming-versus-memoization.html for a discussion of the code.

<lang racket>#lang racket

(define (memoize f)

 (local ([define table (make-hash)])
   (lambda args
     (dict-ref! table args (λ () (apply f args))))))

(define levenshtein

 (memoize
  (lambda (s t)
    (cond
      [(and (empty? s) (empty? t)) 0]
      [(empty? s) (length t)]
      [(empty? t) (length s)]
      [else
       (if (equal? (first s) (first t))
           (levenshtein (rest s) (rest t))
           (min (add1 (levenshtein (rest s) t))
                (add1 (levenshtein s (rest t)))
                (add1 (levenshtein (rest s) (rest t)))))]))))</lang>

Demonstration: <lang racket>(levenshtein (string->list "rosettacode")

            (string->list "raisethysword"))</lang>
Output:
8

Complete version

Now we extend the code from http://blog.racket-lang.org/2012/08/dynamic-programming-versus-memoization.html to show also the alignment. The code is very similar, but it stores the partial results (number of edits and alignment of each substring) in a lev structure. <lang Racket>#lang racket

(struct lev (n s t))

(define (lev-add old n sx tx)

 (lev (+ n (lev-n old))
      (cons sx (lev-s old))
      (cons tx (lev-t old))))

(define (list-repeat n v)

 (build-list n (lambda (_) v)))

(define (memoize f)

 (local ([define table (make-hash)])
   (lambda args
     (dict-ref! table args (λ () (apply f args))))))

(define levenshtein/list

 (memoize
  (lambda (s t)
    (cond
      [(and (empty? s) (empty? t)) 
       (lev 0 '() '())]
      [(empty? s) 
       (lev (length t) (list-repeat (length t) #\-) t)]
      [(empty? t) 
       (lev (length s) s (list-repeat (length s) #\-))]
      [else
       (if (equal? (first s) (first t))
           (lev-add (levenshtein/list (rest s) (rest t))
                    0 (first s) (first t))
           (argmin lev-n (list (lev-add (levenshtein/list (rest s) t)
                                        1 (first s) #\-)
                               (lev-add (levenshtein/list s (rest t))
                                        1 #\- (first t))
                               (lev-add (levenshtein/list (rest s) (rest t))
                                        1 (first s) (first t)))))]))))

(define (levenshtein s t)

 (let ([result (levenshtein/list (string->list s) 
                                 (string->list t))])
   (values (lev-n result)
           (list->string (lev-s result)) 
           (list->string (lev-t result)))))</lang>

Demonstration: <lang racket>(let-values ([(dist exp-s exp-t)

             (levenshtein "rosettacode" "raisethysword")])
 (printf "levenshtein: ~a edits\n" dist)
 (displayln exp-s)
 (displayln exp-t))</lang>
Output:
levenshtein: 8 edits
r-oset-taco-de
raisethysword-

Ruby

Translation of: Tcl

uses "lcs" from here <lang ruby>require 'lcs'

def levenshtein_align(a, b)

 apos, bpos = LCS.new(a, b).backtrack2
 
 c = ""
 d = ""
 x0 = y0 = -1
 dx = dy = 0
 apos.zip(bpos) do |x,y|
   diff = x + dx - y - dy
   if diff < 0
     dx -= diff
     c += "-" * (-diff)
   elsif diff > 0
     dy += diff
     d += "-" * diff
   end
   c += a[x0+1..x]
   x0 = x
   d += b[y0+1..y]
   y0 = y
 end
 
 c += a[x0+1..-1]
 d += b[y0+1..-1]
 diff = a.length + y0 - b.length - x0
 if diff < 0
   c += "-" * (-diff)
 elsif diff > 0
   d += "-" * diff
 end
 [c, d]

end

puts levenshtein_align("rosettacode", "raisethysword")</lang>

Output:
r-oset-taco-de
raisethysword-

Sidef

Translation of: Perl

<lang ruby>func align(s, t) {

   s.chars!.prepend!('^')
   t.chars!.prepend!('^')
   var A = []
   for i in ^s { A[i][0]{<d s t>} = (i, s.ft(1, i).join, '~' * i) }
   for i in ^t { A[0][i]{<d s t>} = (i, '-' * i, t.ft(1, i).join) }
   s.end.times { |i|
     t.end.times { |j|
       if (s[i] != t[j]) {
         A[i][j]{:d} = 1+(
           var min = Math.min(A[i-1][j]{:d}, A[i][j-1]{:d}, A[i-1][j-1]{:d})
         )
         A[i][j]{} = @|(A[i-1][j]{:d} == min
             ? [A[i-1][j]{:s}+s[i], A[i-1][j]{:t}+'-']
             : (A[i][j-1]{:d} == min
             ? [A[i][j-1]{:s}+'-', A[i][j-1]{:t}+t[j]]
             : [A[i-1][j-1]{:s}+s[i], A[i-1][j-1]{:t}+t[j]]))
       }
       else {
         A[i][j]{<d s t>} = (
             A[i-1][j-1]{:d},
             A[i-1][j-1]{:s}+s[i],
             A[i-1][j-1]{:t}+t[j],
         )
       }
     }
   }
   return [A[-1][-1]{}]

}

align("rosettacode", "raisethysword").each { .say }</lang>

Output:
ro-settac-o-de
raisethysword-

Tcl

Library: Tcllib (Package: struct::list)

<lang tcl>package require struct::list proc levenshtein/align {a b} {

   lassign [struct::list longestCommonSubsequence [split $a ""] [split $b ""]]\

apos bpos

   set c ""
   set d ""
   set x0 [set y0 -1]
   set dx [set dy 0]
   foreach x $apos y $bpos {

if {$x+$dx < $y+$dy} { set n [expr {($y+$dy)-($x+$dx)}] incr dx $n append c [string repeat "-" $n] } elseif {$x+$dx > $y+$dy} { set n [expr {($x+$dx)-($y+$dy)}] incr dy $n append d [string repeat "-" $n] } append c [string range $a $x0+1 $x] set x0 $x append d [string range $b $y0+1 $y] set y0 $y

   }
   append c [string range $a $x0+1 end]
   append d [string range $b $y0+1 end]
   set al [string length $a]
   set bl [string length $b]
   if {$al+$y0 < $bl+$x0} {

append c [string repeat "-" [expr {$bl+$x0-$y0-$al}]]

   } elseif {$bl+$x0 < $al+$y0} {

append d [string repeat "-" [expr {$al+$y0-$x0-$bl}]]

   }
   return $c\n$d

}

puts [levenshtein/align "rosettacode" "raisethysword"]</lang>

Output:
r-oset-taco-de
raisethysword-

zkl

Translation of: Java

<lang zkl>fcn alignment(a,b){

  a,b = a.toLower(), b.toLower();
  costs := (a.len()+1).pump(List(),'wrap(a){ [1..b.len()].pump(List(a)) });
  foreach i,j in (a.len()+1, [1..b.len()]){
     costs[i][j] = ( 1 + costs[i-1][j].min(costs[i][j-1]) ) 
        .min(if(a[i-1] == b[j-1]) costs[i-1][j-1] else costs[i-1][j-1] + 1);
  }
  // walk back through matrix to figure out path
  aPathRev,bPathRev := Data(),Data();  // byte buckets
  i,j := a.len(), b.len(); 
  while(i!=0 and j!= 0){
     if (costs[i][j] == 
         ( if(a[i-1]==b[j-1]) costs[i-1][j-1] else costs[i-1][j-1]+1 )){
        aPathRev.append(a[i-=1]);

bPathRev.append(b[j-=1]);

     } else if(costs[i][j] == 1+costs[i-1][j]){

aPathRev.append(a[i-=1]); bPathRev.append("-");

     } else if (costs[i][j] == 1+costs[i][j-1]){

aPathRev.append("-"); bPathRev.append(b[j-=1]);

     }
  }
  return(aPathRev.text.reverse(), bPathRev.text.reverse())

}</lang> <lang zkl>result := alignment("rosettacode", "raisethysword"); println(result[0]); println(result[1]);</lang>

Output:
r-oset-tacode
raisethysword