Levenshtein distance/Alignment: Difference between revisions

From Rosetta Code
Content added Content deleted
(→‎{{header|Tcl}}: Added zkl)
m (→‎{{header|Perl 6}}: it doesn't output what it says it does)
Line 302: Line 302:


=={{header|Perl 6}}==
=={{header|Perl 6}}==
{{incorrect}}
{{trans|Perl}}
{{trans|Perl}}
<lang Perl 6>sub align ( Str $σ, Str $t ) {
<lang Perl 6>sub align ( Str $σ, Str $t ) {
Line 311: Line 312:
@A[ 0][$_]<d s t> = $_, '-' x $_, @t[1..$_].join for ^@t;
@A[ 0][$_]<d s t> = $_, '-' x $_, @t[1..$_].join for ^@t;
for 1 ..^ @s X 1..^ @t -> \i, \j {
for 1 ..^ @s X 1..^ @t -> (\i, \j) {
if @s[i] ne @t[j] {
if @s[i] ne @t[j] {
@A[i][j]<d> = 1 + my $min =
@A[i][j]<d> = 1 + my $min =

Revision as of 02:08, 6 February 2016

Levenshtein distance/Alignment is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.

The Levenshtein distance algorithm returns the number of atomic operations (insertion, deletion or edition) that must be performed on a string in order to obtain an other one, but it does not say anything about the actual operations used or their order.

An alignment is a notation used to describe the operations used to turn a string into an other. At some point in the strings, the minus character ('-') is placed in order to signify that a character must be added at this very place. For instance, an alignment between the words 'place' and 'palace' is:

P-LACE
PALACE

For this task, write a function that shows the alignment of two strings for the corresponding levenshtein distance. As an example, use the words "rosettacode" and "raisethysword".

You can either implement an algorithm, or use a dedicated library (thus showing us how it is named in your language).

C

<lang c>#include <stdio.h>

  1. include <stdlib.h>
  2. include <string.h>

typedef struct edit_s edit_t, *edit; struct edit_s { char c1, c2; int n; edit next; };

void leven(char *a, char *b) { int i, j, la = strlen(a), lb = strlen(b); edit *tbl = malloc(sizeof(edit) * (1 + la)); tbl[0] = calloc((1 + la) * (1 + lb), sizeof(edit_t)); for (i = 1; i <= la; i++) tbl[i] = tbl[i-1] + (1+lb);

for (i = la; i >= 0; i--) { char *aa = a + i; for (j = lb; j >= 0; j--) { char *bb = b + j; if (!*aa && !*bb) continue;

edit e = &tbl[i][j]; edit repl = &tbl[i+1][j+1]; edit dela = &tbl[i+1][j]; edit delb = &tbl[i][j+1];

e->c1 = *aa; e->c2 = *bb; if (!*aa) { e->next = delb; e->n = e->next->n + 1; continue; } if (!*bb) { e->next = dela; e->n = e->next->n + 1; continue; }

e->next = repl; if (*aa == *bb) { e->n = e->next->n; continue; }

if (e->next->n > delb->n) { e->next = delb; e->c1 = 0; } if (e->next->n > dela->n) { e->next = dela; e->c1 = *aa; e->c2 = 0; } e->n = e->next->n + 1; } }

edit p = tbl[0]; printf("%s -> %s: %d edits\n", a, b, p->n);

while (p->next) { if (p->c1 == p->c2) printf("%c", p->c1); else { putchar('('); if (p->c1) putchar(p->c1); putchar(','); if (p->c2) putchar(p->c2); putchar(')'); }

p = p->next; } putchar('\n');

free(tbl[0]); free(tbl); }

int main(void) { leven("raisethysword", "rosettacode"); return 0; }</lang>

Output:
raisethysword -> rosettacode: 8 edits
r(a,o)(i,)set(h,t)(y,a)(s,c)(w,)o(r,d)(d,e)

D

Using the standard library. <lang d>void main() {

   import std.stdio, std.algorithm;
   immutable s1 = "rosettacode";
   immutable s2 = "raisethysword";
   string s1b, s2b;
   size_t pos1, pos2;
   foreach (immutable c; levenshteinDistanceAndPath(s1, s2)[1]) {
       final switch (c) with (EditOp) {
           case none, substitute:
               s1b ~= s1[pos1++];
               s2b ~= s2[pos2++];
               break;
           case insert:
               s1b ~= "_";
               s2b ~= s2[pos2++];
               break;
           case remove:
               s1b ~= s1[pos1++];
               s2b ~= "_";
               break;
       }
   }
   writeln(s1b, "\n", s2b);

}</lang>

Output:
r_oset_tacode
raisethysword

Go

Library: biogo

Alignment computed by the Needleman-Wunch algorithm, which is used in bioinformatics. <lang go>package main

import (

   "fmt"
   "github.com/biogo/biogo/align"
   ab "github.com/biogo/biogo/alphabet"
   "github.com/biogo/biogo/feat"
   "github.com/biogo/biogo/seq/linear"

)

func main() {

   // Alphabets for things like DNA are predefined in biogo, but we
   // define our own here.
   lc := ab.Must(ab.NewAlphabet("-abcdefghijklmnopqrstuvwxyz",
       feat.Undefined, '-', 0, true))
   // Construct scoring matrix for Needleman-Wunch algorithm.
   // We leave zeros on the diagonal for the Levenshtein distance of an
   // exact match and put -1s everywhere else for the Levenshtein distance
   // of an edit.
   nw := make(align.NW, lc.Len())
   for i := range nw {
       r := make([]int, lc.Len())
       nw[i] = r
       for j := range r {
           if j != i {
               r[j] = -1
           }
       }
   }
   // define input sequences
   a := &linear.Seq{Seq: ab.BytesToLetters([]byte("rosettacode"))}
   a.Alpha = lc
   b := &linear.Seq{Seq: ab.BytesToLetters([]byte("raisethysword"))}
   b.Alpha = lc
   // perform alignment
   aln, err := nw.Align(a, b)
   // format and display result
   if err != nil {
       fmt.Println(err)
       return
   }
   fa := align.Format(a, b, aln, '-')
   fmt.Printf("%s\n%s\n", fa[0], fa[1])
   aa := fmt.Sprint(fa[0])
   ba := fmt.Sprint(fa[1])
   ma := make([]byte, len(aa))
   for i := range ma {
       if aa[i] == ba[i] {
           ma[i] = ' '
       } else {
           ma[i] = '|'
       }
   }
   fmt.Println(string(ma))

}</lang>

Output:

The lines after the alignment point out the 8 edits.

r-oset-tacode
raisethysword
 ||   |||| ||

Java

<lang java>public class LevenshteinAlignment {

   public static String[] alignment(String a, String b) {
       a = a.toLowerCase();
       b = b.toLowerCase();
       // i == 0
       int[][] costs = new int[a.length()+1][b.length()+1];
       for (int j = 0; j <= b.length(); j++)
           costs[0][j] = j;
       for (int i = 1; i <= a.length(); i++) {
           costs[i][0] = i;
           for (int j = 1; j <= b.length(); j++) {
               costs[i][j] = Math.min(1 + Math.min(costs[i-1][j], costs[i][j-1]), a.charAt(i - 1) == b.charAt(j - 1) ? costs[i-1][j-1] : costs[i-1][j-1] + 1);
           }
       }

// walk back through matrix to figure out path StringBuilder aPathRev = new StringBuilder(); StringBuilder bPathRev = new StringBuilder(); for (int i = a.length(), j = b.length(); i != 0 && j != 0; ) { if (costs[i][j] == (a.charAt(i - 1) == b.charAt(j - 1) ? costs[i-1][j-1] : costs[i-1][j-1] + 1)) { aPathRev.append(a.charAt(--i)); bPathRev.append(b.charAt(--j)); } else if (costs[i][j] == 1 + costs[i-1][j]) { aPathRev.append(a.charAt(--i)); bPathRev.append('-'); } else if (costs[i][j] == 1 + costs[i][j-1]) { aPathRev.append('-'); bPathRev.append(b.charAt(--j)); } }

       return new String[]{aPathRev.reverse().toString(), bPathRev.reverse().toString()};
   }
   public static void main(String[] args) {

String[] result = alignment("rosettacode", "raisethysword"); System.out.println(result[0]); System.out.println(result[1]);

   }

}</lang>

Output:
r-oset-tacode
raisethysword

Mathematica / Wolfram Language

<lang Mathematica>DamerauLevenshteinDistance["rosettacode", "raisethysword"]</lang>

Output:
8

Perl

<lang perl>use strict; use warnings;

use List::Util qw(min);

sub levenshtein_distance_alignment {

   my @s = ('^', split //, shift);
   my @t = ('^', split //, shift);

   my @A;
   @{$A[$_][0]}{qw(d s t)} = ($_, join(, @s[1 .. $_]), ('~' x $_)) for 0 .. $#s;
   @{$A[0][$_]}{qw(d s t)} = ($_, ('-' x $_), join , @t[1 .. $_])  for 0 .. $#t;
   for my $i (1 .. $#s) {
       for my $j (1 .. $#t) {

if ($s[$i] ne $t[$j]) { $A[$i][$j]{d} = 1 + ( my $min = min $A[$i-1][$j]{d}, $A[$i][$j-1]{d}, $A[$i-1][$j-1]{d} ); @{$A[$i][$j]}{qw(s t)} = $A[$i-1][$j]{d} == $min ? ($A[$i-1][$j]{s}.$s[$i], $A[$i-1][$j]{t}.'-') : $A[$i][$j-1]{d} == $min ? ($A[$i][$j-1]{s}.'-', $A[$i][$j-1]{t}.$t[$j]) : ($A[$i-1][$j-1]{s}.$s[$i], $A[$i-1][$j-1]{t}.$t[$j]); }

           else {

@{$A[$i][$j]}{qw(d s t)} = ( $A[$i-1][$j-1]{d}, $A[$i-1][$j-1]{s}.$s[$i], $A[$i-1][$j-1]{t}.$t[$j] );

           }
       }
   }
   return @{$A[-1][-1]}{'s', 't'};

}

print join "\n", levenshtein_distance_alignment "rosettacode", "raisethysword";</lang>

Output:
ro-settac-o-de
raisethysword-

Perl 6

This example is incorrect. It does not accomplish the given task. Please fix the code and remove this message.
Translation of: Perl

<lang Perl 6>sub align ( Str $σ, Str $t ) {

   my @s = *, $σ.comb;
   my @t = *, $t.comb;
    
   my @A;
   @A[$_][ 0]<d s t> = $_, @s[1..$_].join, '-' x $_ for ^@s;
   @A[ 0][$_]<d s t> = $_, '-' x $_, @t[1..$_].join for ^@t;
    
   for 1 ..^ @s X 1..^ @t -> (\i, \j) {

if @s[i] ne @t[j] { @A[i][j]<d> = 1 + my $min = min @A[i-1][j]<d>, @A[i][j-1]<d>, @A[i-1][j-1]<d>; @A[i][j] = @A[i-1][j]<d> == $min ?? (@A[i-1][j] Z~ @s[i], '-') !! @A[i][j-1]<d> == $min ?? (@A[i][j-1] Z~ '-', @t[j]) !! (@A[i-1][j-1] Z~ @s[i], @t[j]); } else { @A[i][j]<d s t> = @A[i-1][j-1]<d s t> Z~ , @s[i], @t[j]; }

   }
    
   return @A[*-1][*-1];

}

.say for align |<rosettacode raisethysword>;</lang>

Output:
ro-settac-o-de
raisethysword-

Racket

Simple version (no aligment)

First we will analyze this solution that only computes the distance. See http://blog.racket-lang.org/2012/08/dynamic-programming-versus-memoization.html for a discussion of the code.

<lang racket>#lang racket

(define (memoize f)

 (local ([define table (make-hash)])
   (lambda args
     (dict-ref! table args (λ () (apply f args))))))

(define levenshtein

 (memoize
  (lambda (s t)
    (cond
      [(and (empty? s) (empty? t)) 0]
      [(empty? s) (length t)]
      [(empty? t) (length s)]
      [else
       (if (equal? (first s) (first t))
           (levenshtein (rest s) (rest t))
           (min (add1 (levenshtein (rest s) t))
                (add1 (levenshtein s (rest t)))
                (add1 (levenshtein (rest s) (rest t)))))]))))</lang>

Demonstration: <lang racket>(levenshtein (string->list "rosettacode")

            (string->list "raisethysword"))</lang>
Output:
8

Complete version

Now we extend the code from http://blog.racket-lang.org/2012/08/dynamic-programming-versus-memoization.html to show also the alignment. The code is very similar, but it stores the partial results (number of edits and alignment of each substring) in a lev structure. <lang Racket>#lang racket

(struct lev (n s t))

(define (lev-add old n sx tx)

 (lev (+ n (lev-n old))
      (cons sx (lev-s old))
      (cons tx (lev-t old))))

(define (list-repeat n v)

 (build-list n (lambda (_) v)))

(define (memoize f)

 (local ([define table (make-hash)])
   (lambda args
     (dict-ref! table args (λ () (apply f args))))))

(define levenshtein/list

 (memoize
  (lambda (s t)
    (cond
      [(and (empty? s) (empty? t)) 
       (lev 0 '() '())]
      [(empty? s) 
       (lev (length t) (list-repeat (length t) #\-) t)]
      [(empty? t) 
       (lev (length s) s (list-repeat (length s) #\-))]
      [else
       (if (equal? (first s) (first t))
           (lev-add (levenshtein/list (rest s) (rest t))
                    0 (first s) (first t))
           (argmin lev-n (list (lev-add (levenshtein/list (rest s) t)
                                        1 (first s) #\-)
                               (lev-add (levenshtein/list s (rest t))
                                        1 #\- (first t))
                               (lev-add (levenshtein/list (rest s) (rest t))
                                        1 (first s) (first t)))))]))))

(define (levenshtein s t)

 (let ([result (levenshtein/list (string->list s) 
                                 (string->list t))])
   (values (lev-n result)
           (list->string (lev-s result)) 
           (list->string (lev-t result)))))</lang>

Demonstration: <lang racket>(let-values ([(dist exp-s exp-t)

             (levenshtein "rosettacode" "raisethysword")])
 (printf "levenshtein: ~a edits\n" dist)
 (displayln exp-s)
 (displayln exp-t))</lang>
Output:
levenshtein: 8 edits
r-oset-taco-de
raisethysword-

Ruby

Translation of: Tcl

uses "lcs" from here <lang ruby>require 'lcs'

def levenshtein_align(a, b)

 apos, bpos = LCS.new(a, b).backtrack2
 
 c = ""
 d = ""
 x0 = y0 = -1
 dx = dy = 0
 apos.zip(bpos) do |x,y|
   diff = x + dx - y - dy
   if diff < 0
     dx -= diff
     c += "-" * (-diff)
   elsif diff > 0
     dy += diff
     d += "-" * diff
   end
   c += a[x0+1..x]
   x0 = x
   d += b[y0+1..y]
   y0 = y
 end
 
 c += a[x0+1..-1]
 d += b[y0+1..-1]
 diff = a.length + y0 - b.length - x0
 if diff < 0
   c += "-" * (-diff)
 elsif diff > 0
   d += "-" * diff
 end
 [c, d]

end

puts levenshtein_align("rosettacode", "raisethysword")</lang>

Output:
r-oset-taco-de
raisethysword-

Tcl

Library: Tcllib (Package: struct::list)

<lang tcl>package require struct::list proc levenshtein/align {a b} {

   lassign [struct::list longestCommonSubsequence [split $a ""] [split $b ""]]\

apos bpos

   set c ""
   set d ""
   set x0 [set y0 -1]
   set dx [set dy 0]
   foreach x $apos y $bpos {

if {$x+$dx < $y+$dy} { set n [expr {($y+$dy)-($x+$dx)}] incr dx $n append c [string repeat "-" $n] } elseif {$x+$dx > $y+$dy} { set n [expr {($x+$dx)-($y+$dy)}] incr dy $n append d [string repeat "-" $n] } append c [string range $a $x0+1 $x] set x0 $x append d [string range $b $y0+1 $y] set y0 $y

   }
   append c [string range $a $x0+1 end]
   append d [string range $b $y0+1 end]
   set al [string length $a]
   set bl [string length $b]
   if {$al+$y0 < $bl+$x0} {

append c [string repeat "-" [expr {$bl+$x0-$y0-$al}]]

   } elseif {$bl+$x0 < $al+$y0} {

append d [string repeat "-" [expr {$al+$y0-$x0-$bl}]]

   }
   return $c\n$d

}

puts [levenshtein/align "rosettacode" "raisethysword"]</lang>

Output:
r-oset-taco-de
raisethysword-

zkl

Translation of: Java

<lang zkl>fcn alignment(a,b){

  a,b = a.toLower(), b.toLower();
  costs := (a.len()+1).pump(List(),'wrap(a){ [1..b.len()].pump(List(a)) });
  foreach i,j in (a.len()+1, [1..b.len()]){
     costs[i][j] = ( 1 + costs[i-1][j].min(costs[i][j-1]) ) 
        .min(if(a[i-1] == b[j-1]) costs[i-1][j-1] else costs[i-1][j-1] + 1);
  }
  // walk back through matrix to figure out path
  aPathRev,bPathRev := Data(),Data();  // byte buckets
  i,j := a.len(), b.len(); 
  while(i!=0 and j!= 0){
     if (costs[i][j] == 
         ( if(a[i-1]==b[j-1]) costs[i-1][j-1] else costs[i-1][j-1]+1 )){
        aPathRev.append(a[i-=1]);

bPathRev.append(b[j-=1]);

     } else if(costs[i][j] == 1+costs[i-1][j]){

aPathRev.append(a[i-=1]); bPathRev.append("-");

     } else if (costs[i][j] == 1+costs[i][j-1]){

aPathRev.append("-"); bPathRev.append(b[j-=1]);

     }
  }
  return(aPathRev.text.reverse(), bPathRev.text.reverse())

}</lang> <lang zkl>result := alignment("rosettacode", "raisethysword"); println(result[0]); println(result[1]);</lang>

Output:
r-oset-tacode
raisethysword