Factorize string into Lyndon words: Difference between revisions

From Rosetta Code
Content added Content deleted
(→‎{{header|RPL}}: version with local variables)
(→‎{{header|ALGOL 68}}: Fix for cases where LWB s isn't 1)
Line 25: Line 25:
PROC chen fox lyndon factorization = ( STRING s )[]STRING:
PROC chen fox lyndon factorization = ( STRING s )[]STRING:
BEGIN
BEGIN
INT n = ( UPB s - LWB s ) + 1;
INT n = UPB s;
INT i := LWB s;
INT i := LWB s;
FLEX[ 1 : 100 ]STRING factorization;
FLEX[ 1 : 100 ]STRING factorization;

Revision as of 10:10, 4 February 2024

Task
Factorize string into Lyndon words
You are encouraged to solve this task according to the task description, using any language you may know.

Given a finite alphabet, we can lexicographically order all strings in the alphabet. If two strings have different lengths, then pad the shorter string on the right with the smallest letter. For example, we have 01 > 001, but 01 = 010. As we see, this order is a total preorder, but not a total order, since it identifies different strings.

A Lyndon word is a non-empty string that is strictly lower in lexicographic order than all its circular rotations. In particular, if a string is equal to a circular rotation, then it is not a Lyndon word. For example, since 0101 = 0101 (rotation by 2), it is not a Lyndon word.

The first Lyndon words on the binary alphabet {0, 1} are:

0, 1, 01, 001, 011, 0001, 0011, 0111, 00001, 00011, 00101, 00111, 01011, 01111, ...

Some basic properties:

  • The only Lyndon word that ends with 0 is 0.
  • Proof. If s0 is a Lyndon word, and s is not empty, then s0 < 0s. If s contains 1 somewhere, then s0 > 0s. Therefore s has only 0. But then s0 = 0s, contradiction.
  • The lexicographic order is a total order on the Lyndon words.
  • Proof. For, the only way for two different strings s, s' to have the same lexicographic ordering is for one of them to pad to the other. We can assume that s00...0 = s'. If that is so, then s00...0 is a Lyndon word that ends with 0, so it is just 0, and so s is a Lyndon word that is also empty, contradiction.

The Chen–Fox–Lyndon theorem states that any string is uniquely factorizable into a sequence of Lyndon words non-decreasing in lexicographic order. Duval's algorithm computes this in O(length of input) time and and O(1) space.[1] See [2] for a description of Duval's algorithm.

ALGOL 68

Translation of: Python – with multiline output as with a number of other samples
BEGIN # Factorize a string into Lyndon words - translated of the Python sample #

    PROC chen fox lyndon factorization = ( STRING s )[]STRING:
         BEGIN
            INT n  = UPB s;
            INT i := LWB s;
            FLEX[ 1 : 100 ]STRING factorization;
            INT f max := LWB factorization;
            WHILE i <= n DO
                INT j := i + 1, k := i;
                WHILE IF j <= n THEN s[ k ] <= s[ j ] ELSE FALSE FI DO
                    IF s[ k ] < s[ j ] THEN
                        k  := i
                    ELSE
                        k +:= 1
                    FI;
                    j +:= 1
                OD;
                WHILE i <= k DO
                    f max +:= 1;
                    IF f max > UPB factorization THEN
                        [ 1 : UPB factorization + 20 ]STRING new factorization;
                        new factorization[ 1 : UPB factorization ] := factorization;
                        factorization := new factorization
                    FI;
                    factorization[ f max ] := s[ i : i + j - k - 1 ];
                    i +:= j - k
                OD
            OD;
            STRING check := "";
            FOR i TO f max DO check +:= factorization[ i ] OD;
            IF check /= s THEN print( ( "Incorrect factorization", newline ) ); stop FI;
            factorization[ 1 : f max ]
         END # chen fox lyndon factorization # ;

    # Example use with Thue-Morse sequence                                     #
    STRING m := "0";
    TO 7 DO
         STRING m0 = m;
         FOR i FROM LWB m TO UPB m DO
             IF m[ i ]  = "0" THEN m[ i ] := "1" ELIF m[ i ]  = "1" THEN m[ i ] := "0" FI
         OD;
         m0 +=: m
    OD;

    []STRING m factors = chen fox lyndon factorization( m );
    FOR i FROM LWB m factors TO UPB m factors DO
        print( ( m factors[ i ], newline ) )
    OD

END
Output:
011
01
0011
00101101
0010110011010011
00101100110100101101001100101101
001011001101001011010011001011001101001100101101
001011001101
001

C++

Copied from [2], under Creative Commons Attribution Share Alike 4.0 International License and converted to a runnable C++ program.

#include <iostream>
#include <vector>
#include <string>
#include <algorithm>

using namespace std;

vector<string> duval(string const& s) {
    int n = s.size();
    int i = 0;
    vector<string> factorization;
    while (i < n) {
        int j = i + 1, k = i;
        while (j < n && s[k] <= s[j]) {
            if (s[k] < s[j])
                k = i;
            else
                k++;
            j++;
        }
        while (i <= k) {
            factorization.push_back(s.substr(i, j - k));
            i += j - k;
        }
    }
    return factorization;
}

int main() {
    // Thue-Morse example
    string m = "0";
    for (int i = 0; i < 7; ++i) {
        string m0 = m;
        replace(m.begin(), m.end(), '0', 'a');
        replace(m.begin(), m.end(), '1', '0');
        replace(m.begin(), m.end(), 'a', '1');
        m = m0 + m;
    }
    for (string s : duval(m)) cout << s << endl;
    return 0;
}
Output:
011
01
0011
00101101
0010110011010011
00101100110100101101001100101101
001011001101001011010011001011001101001100101101
001011001101
001

Julia

Translation of: Phix
Translation of: Python
function chenfoxlyndonfactorization(s)
    n = length(s)
    i = 1
    factorization = String[]
    while i <= n
        j = i + 1
        k = i
        while j <= n && s[k] <= s[j]
            if s[k] < s[j]
                k = i
            else
                k += 1
            end
            j += 1
        end
        while i <= k
            push!(factorization, s[i:i+j-k-1])
            i += j - k
        end
    end
    @assert s == prod(factorization)
    return factorization
end

let m = "0"
    for i in 1:7
        m0 = m
        m = replace(m, '0' => 'a')
        m = replace(m, '1' => '0')
        m = replace(m, 'a' => '1')
        m = m0 * m
    end
    println(chenfoxlyndonfactorization(m))
end
Output:
["011", "01", "0011", "00101101", "0010110011010011", "00101100110100101101001100101101", "001011001101001011010011001011001101001100101101", "001011001101", "001"]


MATLAB

Translation of: Python
clear all;close all;clc;
m = '0';
for i = 1:7
    m0 = m;
    m = strrep(m, '0', 'a');
    m = strrep(m, '1', '0');
    m = strrep(m, 'a', '1');
    m = strcat(m0, m);
end

factorization = chenFoxLyndonFactorization(m);
for index=1:length(factorization)
    disp(factorization(index));
end


function factorization = chenFoxLyndonFactorization(s)
    n = length(s);
    i = 1;
    factorization = {};
    while i <= n
        j = i + 1;
        k = i;
        while j <= n && s(k) <= s(j)
            if s(k) < s(j)
                k = i;
            else
                k = k + 1;
            end
            j = j + 1;
        end
        while i <= k
            factorization{end+1} = s(i:i + j - k - 1);
            i = i + j - k;
        end
    end
    assert(strcmp(join(factorization, ''), s));
end
Output:
    {'011'}

    {'01'}

    {'0011'}

    {'00101101'}

    {'0010110011010011'}

    {'00101100110100101101001100101101'}

    {'001011001101001011010011001011001101001100101101'}

    {'001011001101'}

    {'001'}

Phix

Translation of: Python
with javascript_semantics
function chen_fox_lyndon_factorization(string s)
    integer n = length(s), i = 1
    sequence factorization = {}
    while i <= n do
        integer j = i + 1, k = i
        while j <= n and s[k] <= s[j] do
            if s[k] < s[j] then
                k = i
            else
                k += 1
            end if
            j += 1
        end while
        while i <= k do
            factorization &= {s[i..i+j-k-1]}
            i += j - k
        end while
    end while
    assert(join(factorization,"") == s)
    return factorization
end function
-- Example use with Thue-Morse sequence
string m = "0"
for i=1 to 7 do
    m &= sq_sub('0'+'1',m)
end for
?chen_fox_lyndon_factorization(m)
-- alt:
printf(1,"\n%s\n",join(chen_fox_lyndon_factorization(m),"\n"))
Output:
{"011","01","0011","00101101","0010110011010011","00101100110100101101001100101101","001011001101001011010011001011001101001100101101","001011001101","001"}
 
011
01
0011
00101101
0010110011010011
00101100110100101101001100101101
001011001101001011010011001011001101001100101101
001011001101
001

Python

Duval's algorithm:

def chen_fox_lyndon_factorization(s):
    n = len(s)
    i = 0
    factorization = []
    while i < n:
        j, k = i + 1, i
        while j < n and s[k] <= s[j]:
            if s[k] < s[j]:
                k = i
            else:
                k += 1
            j += 1
        while i <= k:
            factorization.append(s[i:i + j - k])
            i += j - k
    assert ''.join(factorization) == s
    return factorization

Example use with Thue-Morse sequence

m='0'
for i in range(0,7):
     m0=m
     m=m.replace('0','a')
     m=m.replace('1','0')
     m=m.replace('a','1')
     m=m0+m

print(chen_fox_lyndon_factorization(m))

Output:

['011', '01', '0011', '00101101', '0010110011010011', '00101100110100101101001100101101', '001011001101001011010011001011001101001100101101', '001011001101', '001']

RPL

Based on Duval's algorithm. THUEM is defined at Thue-Morse.

Works with: HP version 48G
« DUP SIZE { } → s n res
  « 1
    WHILE DUP n ≤ REPEAT
       DUP 1 + OVER
       WHILE s OVER DUP SUB s 4 PICK DUP SUB DUP2 ≤ 5 PICK n ≤ AND REPEAT
          IF < THEN DROP OVER ELSE 1 + END
          SWAP 1 + SWAP
       END DROP2
       SWAP OVER - SWAP ROT
       WHILE DUP2 ≥ REPEAT
          'res' s 3 PICK DUP 7 PICK + 1 - SUB STO+
          3 PICK +
       END ROT ROT DROP2
     END 
     DROP res
» » 'LYNDON' STO     @ ( "string" → { "f1".."fn" } )

« 2 7 ^ THUEM « →STR + » STREAM
  LYNDON
» 'TASK' STO 
Output:
1: { "011" "01" "0011" "00101101" "0010110011010011" "00101100110100101101001100101101" "001011001101001011010011001011001101001100101101" "001011001101" "001" }

Using local variables

The version below favors local variables to the stack, which makes it a little bit slower - but easier to read.

« DUP SIZE 1 → s n ii
  « { }
    WHILE ii n ≤ REPEAT
       ii DUP 1 + → k j
       « WHILE s k DUP SUB s j DUP SUB DUP2 ≤ ii n ≤ AND REPEAT
            < ii k 1 + IFTE 'k' STO
            'j' 1 STO+
          »
         'j' k STO-
         WHILE ii k ≤ REPEAT
            s ii DUP j + 1 - SUB +
            'ii' j STO+
         END 
       »
     END 
» » 'LYNDON' STO     @ ( "string" → { "f1".."fn" } )

Rust

Translation of: Python
fn chen_fox_lyndon_factorization(s: &str) -> Vec<String> {
    let n = s.len();
    let mut i = 0;
    let mut factorization = Vec::new();
    while i < n {
        let (mut j, mut k) = (i + 1, i);
        while j < n && s.as_bytes()[k] <= s.as_bytes()[j] {
            if s.as_bytes()[k] < s.as_bytes()[j] {
                k = i;
            } else {
                k += 1;
            }
            j += 1;
        }
        while i <= k {
            factorization.push(s[i..i + j - k].to_string());
            i += j - k;
        }
    }
    factorization
}

fn main() {
    let mut m = String::from("0");
    for _ in 0..7 {
        let m0 = m.clone();
        m = m.replace("0", "a");
        m = m.replace("1", "0");
        m = m.replace("a", "1");
        m = m0 + &m;
    }

    let factorization = chen_fox_lyndon_factorization(&m);
    println!("{:?}", factorization);
}
Output:
["011", "01", "0011", "00101101", "0010110011010011", "00101100110100101101001100101101", "001011001101001011010011001011001101001100101101", "001011001101", "001"]

Scala

Translation of: Python
object ChenFoxLyndonFactorization extends App {
  def chenFoxLyndonFactorization(s: String): List[String] = {
    val n = s.length
    var i = 0
    var factorization = List[String]()
    while (i < n) {
      var j = i + 1
      var k = i
      while (j < n && s.charAt(k) <= s.charAt(j)) {
        if (s.charAt(k) < s.charAt(j)) {
          k = i
        } else {
          k += 1
        }
        j += 1
      }
      while (i <= k) {
        factorization = factorization :+ s.substring(i, i + j - k)
        i += j - k
      }
    }
    assert(s == factorization.mkString)
    factorization
  }

  var m = "0"
  for (i <- 0 until 7) {
    val m0 = m
    m = m.replace('0', 'a').replace('1', '0').replace('a', '1')
    m = m0 + m
  }

  println(chenFoxLyndonFactorization(m))
}
Output:
List(011, 01, 0011, 00101101, 0010110011010011, 00101100110100101101001100101101, 001011001101001011010011001011001101001100101101, 001011001101, 001)

SETL

Translation of: C++
program lyndon_factorization;
    tm := "01101001100101101001011001101001100101100110"
        + "10010110100110010110100101100110100101101001"
        + "1001011001101001100101101001011001101001";

    loop for part in duval(tm) do
        print(part);
    end loop;

    proc duval(s);
        i := 1;
        fact := [];
        loop while i <= #s do
            j := i + 1;
            k := i;
            loop while j <= #s and s(k) <= s(j) do
                k := if s(k) < s(j) then i else k+1 end;
                j +:= 1;
            end loop;
            loop while i <= k do
                fact with:= s(i..i+j-k-1);
                i +:= j-k;
            end loop;
        end loop;
        return fact;
    end proc;
end program;
Output:
011
01
0011
00101101
0010110011010011
00101100110100101101001100101101
001011001101001011010011001011001101001100101101
001011001101
001

Wren

Translation of: Python
Library: Wren-str
import "./str" for Str

var duval = Fn.new { |s|
    var n = s.count
    var i = 0
    var factorization = []
    while (i < n) {
        var j = i + 1
        var k = i
        while (j < n && Str.le(s[k], s[j])) {
            if (Str.lt(s[k], s[j])) k = i else k = k + 1
            j = j + 1
        }
        while (i <= k) {
            factorization.add(s[i...i+j-k])
            i = i + j - k
        }
    }
    return factorization
}

// Thue-Morse example
var m = "0"
for (i in 0..6) {
    var m0 = m
    m = m.replace("0", "a")
    m = m.replace("1", "0")
    m = m.replace("a", "1")
    m = m0 + m
}
System.print(duval.call(m).join("\n"))>
Output:
011
01
0011
00101101
0010110011010011
00101100110100101101001100101101
001011001101001011010011001011001101001100101101
001011001101
001
  1. Duval, Jean-Pierre (1983), "Factorizing words over an ordered alphabet", Journal of Algorithms, 4 (4): 363–381, doi:10.1016/0196-6774(83)90017-2
  2. 2.0 2.1 https://cp-algorithms.com/string/lyndon_factorization.html