Natural sorting: Difference between revisions

From Rosetta Code
Content added Content deleted
(jq)
m (→‎{{header|jq}}: {{work with}})
Line 778: Line 778:


=={{header|jq}}==
=={{header|jq}}==
{{works with|jq|version with regex support}}
{{works with|jq|with regex support}}


For brevity, this implementation satisfies the first five requirements. It uses the standard approach for requirements 1, 2,
For brevity, this implementation satisfies the first five requirements. It uses the standard approach for requirements 1, 2,

Revision as of 07:02, 1 May 2015

Task
Natural sorting
You are encouraged to solve this task according to the task description, using any language you may know.

Natural sorting is the sorting of text that does more than rely on the order of individual characters codes to make the finding of individual strings easier for a human reader.

There is no "one true way" to do this, but for the purpose of this task 'natural' orderings might include:

1. Ignore leading, trailing and multiple adjacent spaces
2. Make all whitespace characters equivalent.
3. Sorting without regard to case.
4. Sorting numeric portions of strings in numeric order. That is split the string into fields on numeric boundaries, then sort on each field, with the rightmost fields being the most significant, and numeric fields of integers treated as numbers.
foo9.txt before foo10.txt
As well as ... x9y99 before x9y100, before x10y0
... (for any number of groups of integers in a string).
5. Title sorts: without regard to a leading, very common, word such
as 'The' in "The thirty-nine steps".
6. Sort letters without regard to accents.
7. Sort ligatures as separate letters.
8. Replacements:
Sort german scharfes S (ß) as ss
Sort ſ, LATIN SMALL LETTER LONG S as s
Sort ʒ, LATIN SMALL LETTER EZH as s
...
Task Description
  • Implement the first four of the eight given features in a natural sorting routine/function/method...
  • Test each feature implemented separately with an ordered list of test strings from the 'Sample inputs' section below, and make sure your naturally sorted output is in the same order as other language outputs such as Python.
  • Print and display your output.
  • For extra credit implement more than the first four.

Note: It is not necessary to have individual control of which features are active in the natural sorting routine at any time.

Sample input
# Ignoring leading spaces
Text strings:
['ignore leading spaces: 2-2', ' ignore leading spaces: 2-1', '  ignore leading spaces: 2+0', '   ignore leading spaces: 2+1']

# Ignoring multiple adjacent spaces (m.a.s)
Text strings:
['ignore m.a.s spaces: 2-2', 'ignore m.a.s  spaces: 2-1', 'ignore m.a.s   spaces: 2+0', 'ignore m.a.s    spaces: 2+1']


# Equivalent whitespace characters
Text strings:
['Equiv. spaces: 3-3', 'Equiv.\rspaces: 3-2', 'Equiv.\x0cspaces: 3-1', 'Equiv.\x0bspaces: 3+0', 'Equiv.\nspaces: 3+1', 'Equiv.\tspaces: 3+2']

# Case Indepenent sort
Text strings:
['cASE INDEPENENT: 3-2', 'caSE INDEPENENT: 3-1', 'casE INDEPENENT: 3+0', 'case INDEPENENT: 3+1']

# Numeric fields as numerics
Text strings:
['foo100bar99baz0.txt', 'foo100bar10baz0.txt', 'foo1000bar99baz10.txt', 'foo1000bar99baz9.txt']

# Title sorts
Text strings:
['The Wind in the Willows', 'The 40th step more', 'The 39 steps', 'Wanda']

# Equivalent accented characters (and case)
Text strings:
[u'Equiv. \xfd accents: 2-2', u'Equiv. \xdd accents: 2-1', u'Equiv. y accents: 2+0', u'Equiv. Y accents: 2+1']


# Separated ligatures
Text strings:
[u'\u0132 ligatured ij', 'no ligature']

# Character replacements
Text strings:
[u'Start with an \u0292: 2-2', u'Start with an \u017f: 2-1', u'Start with an \xdf: 2+0', u'Start with an s: 2+1']


C

Some differences from task requirement:

  1. req 1 and 2 are not separated. I can't imagine a situation where I'd want one but not the other.
  2. req 5 is implemented differently: not only leading "the", but some common words like "it", "to" etc are discarded everywhere in the string
  3. req 6, 7, 8 are pretty much the same thing, so the implementation doesn't make too much of a distinction of it ("ß" is a ligature, you know.)

Besides the numeric part, everything else was done in a uniform way by transforming input strings into some normalized format and comparing those instead. All sort options flags can be freely mixed together. C source is written in UTF-8 for easier reading here: don't do this for serious code. <lang c>#include <stdio.h>

  1. include <stdlib.h>
  2. include <wchar.h>
  3. include <wctype.h>
  4. include <string.h>
  5. include <locale.h>

typedef struct wstr { wchar_t *s; int n, alloc; } wstr;

  1. define w_del(w) { free(w->s); free(w); }
  2. define forchars(i, c, w) for(i = 0, c = w->s[0]; i < w->n && c; c = w->s[++i])

wstr *w_new() { wstr *w = malloc(sizeof(wstr)); w->alloc = 1; w->n = 0; w->s = malloc(sizeof(wchar_t)); w->s[0] = 0; return w; }

void w_append(wstr *w, wchar_t c) { int n = w->n + 1; if (n >= w->alloc) { w->alloc *= 2; w->s = realloc(w->s, w->alloc * sizeof(wchar_t)); } w->s[w->n++] = c; w->s[w->n] = 0; }

wstr *w_make(wchar_t *s) { int i, len = wcslen(s); wstr *w = w_new(); for (i = 0; i < len; i++) w_append(w, s[i]); return w; }

typedef void (*wtrans_func)(wstr *, wstr *); void w_transform(wstr *in, wtrans_func f) { wstr t, *out = w_new(); f(in, out); t = *in; *in = *out; *out = t; w_del(out); }

  1. define transfunc(x) void w_##x(wstr *in, wstr *out)

transfunc(nocase) { int i; wchar_t c; forchars(i, c, in) w_append(out, towlower(c)); }

transfunc(despace) { int i, gotspace = 0; wchar_t c; forchars(i, c, in) { if (!iswspace(c)) { if (gotspace && out->n) w_append(out, L' '); w_append(out, c); gotspace = 0; } else gotspace = 1; } }

static const wchar_t *const tbl_accent[] = { /* copied from Perl6 code */ L"Þ", L"TH", L"þ", L"th", L"Ð", L"TH", L"ð", L"th", L"À", L"A", L"Á", L"A", L"Â", L"A", L"Ã", L"A", L"Ä", L"A", L"Å", L"A", L"à", L"a", L"á", L"a", L"â", L"a", L"ã", L"a", L"ä", L"a", L"å", L"a", L"Ç", L"C", L"ç", L"c", L"È", L"E", L"É", L"E", L"Ê", L"E", L"Ë", L"E", L"è", L"e", L"é", L"e", L"ê", L"e", L"ë", L"e", L"Ì", L"I", L"Í", L"I", L"Î", L"I", L"Ï", L"I", L"ì", L"i", L"í", L"i", L"î", L"i", L"ï", L"i", L"Ò", L"O", L"Ó", L"O", L"Ô", L"O", L"Õ", L"O", L"Ö", L"O", L"Ø", L"O", L"ò", L"o", L"ó", L"o", L"ô", L"o", L"õ", L"o", L"ö", L"o", L"ø", L"o", L"Ñ", L"N", L"ñ", L"n", L"Ù", L"U", L"Ú", L"U", L"Û", L"U", L"Ü", L"U", L"ù", L"u", L"ú", L"u", L"û", L"u", L"ü", L"u", L"Ý", L"Y", L"ÿ", L"y", L"ý", L"y" };

static const wchar_t *const tbl_ligature[] = { L"Æ", L"AE", L"æ", L"ae", L"ß", L"ss", L"ffl", L"ffl", L"ffi", L"ffi", L"fi", L"fi", L"ff", L"ff", L"fl", L"fl", L"ſ", L"s", L"ʒ", L"z", L"st", L"st", /* ... come on ... */ };

void w_char_repl(wstr *in, wstr *out, const wchar_t *const *tbl, int len) { int i, j, k; wchar_t c; forchars(i, c, in) { for (j = k = 0; j < len; j += 2) { if (c != tbl[j][0]) continue; for (k = 0; tbl[j + 1][k]; k++) w_append(out, tbl[j + 1][k]); break; } if (!k) w_append(out, c); } }

transfunc(noaccent) { w_char_repl(in, out, tbl_accent, sizeof(tbl_accent)/sizeof(wchar_t*)); }

transfunc(noligature) { w_char_repl(in, out, tbl_ligature, sizeof(tbl_ligature)/sizeof(wchar_t*)); }

static const wchar_t *const tbl_article[] = { L"the", L"a", L"of", L"to", L"is", L"it" };

  1. define N_ARTICLES sizeof(tbl_article)/sizeof(tbl_article[0])

transfunc(noarticle) { int i, j, n; wchar_t c, c0 = 0; forchars(i, c, in) { if (!c0 || (iswalnum(c) && !iswalnum(c0))) { /* word boundary */ for (j = N_ARTICLES - 1; j >= 0; j--) { n = wcslen(tbl_article[j]); if (wcsncasecmp(in->s + i, tbl_article[j], n)) continue; if (iswalnum(in->s[i + n])) continue; i += n; break; } if (j < 0) w_append(out, c); } else w_append(out, c); c0 = c; } }

enum { wi_space = 0, wi_case, wi_accent, wi_lig, wi_article, wi_numeric };

  1. define WS_NOSPACE (1 << wi_space)
  2. define WS_NOCASE (1 << wi_case)
  3. define WS_ACCENT (1 << wi_accent)
  4. define WS_LIGATURE (1 << wi_lig)
  5. define WS_NOARTICLE (1 << wi_article)
  6. define WS_NUMERIC (1 << wi_numeric)

const wtrans_func trans_funcs[] = { w_despace, w_nocase, w_noaccent, w_noligature, w_noarticle, 0 }; const char *const flagnames[] = { "collapse spaces", "case insensitive", "disregard accent", "decompose ligatures", "discard common words", "numeric", };

typedef struct { wchar_t* s; wstr *w; } kw_t; int w_numcmp(const void *a, const void *b) { wchar_t *pa = ((const kw_t*)a)->w->s, *pb = ((const kw_t*)b)->w->s; int sa, sb, ea, eb; while (*pa && *pb) { if (iswdigit(*pa) && iswdigit(*pb)) { /* skip leading zeros */ sa = sb = 0; while (pa[sa] == L'0') sa++; while (pb[sb] == L'0') sb++; /* find end of numbers */ ea = sa; eb = sb; while (iswdigit(pa[ea])) ea++; while (iswdigit(pb[eb])) eb++; if (eb - sb > ea - sa) return -1; if (eb - sb < ea - sa) return 1; while (sb < eb) { if (pa[sa] > pb[sb]) return 1; if (pa[sa] < pb[sb]) return -1; sa++; sb++; }

pa += ea; pb += eb; } else if (iswdigit(*pa)) return 1; else if (iswdigit(*pb)) return -1; else { if (*pa > *pb) return 1; if (*pa < *pb) return -1; pa++; pb++; } } return (!*pa && !*pb) ? 0 : *pa ? 1 : -1; }

int w_cmp(const void *a, const void *b) { return wcscmp(((const kw_t*)a)->w->s, ((const kw_t*)b)->w->s); }

void natural_sort(wchar_t **strings, int len, int flags) { int i, j; kw_t *kws = malloc(sizeof(kw_t) * len);

for (i = 0; i < len; i++) { kws[i].s = strings[i]; kws[i].w = w_make(strings[i]); for (j = 0; j < wi_numeric; j++) if (flags & (1 << j) && trans_funcs[j]) w_transform(kws[i].w, trans_funcs[j]); }

qsort(kws, len, sizeof(kw_t), (flags & WS_NUMERIC) ? w_numcmp : w_cmp); for (i = 0; i < len; i++) { w_del(kws[i].w); strings[i] = kws[i].s; } free(kws); }

const wchar_t *const test[] = { L" 0000098 nina", L"100 niño", L"99 Ninja", L"100 NINA", L" The work is so difficult to do it took ſome 100 aeons. ", L"The work is so difficult it took some 100 aeons.", L" The work is so difficult it took ſome 99 æons. ", };

  1. define N_STRINGS sizeof(test)/sizeof(*test)

void test_sort(int flags) { int i, j; const wchar_t *str[N_STRINGS]; memcpy(str, test, sizeof(test));

printf("Sort flags: ("); for (i = 0, j = flags; j; i++, j >>= 1) if ((j & 1)) printf("%s%s", flagnames[i], j > 1 ? ", ":")\n");

natural_sort((wchar_t **)str, N_STRINGS, flags);

for (i = 0; i < N_STRINGS; i++) printf("%ls\n", str[i]); printf("\n"); }

int main() { setlocale(LC_CTYPE, "");

test_sort(WS_NOSPACE); test_sort(WS_NOCASE); test_sort(WS_NUMERIC); test_sort(WS_NOARTICLE|WS_NOSPACE); test_sort(WS_NOCASE|WS_NOSPACE|WS_ACCENT); test_sort(WS_LIGATURE|WS_NOCASE|WS_NOSPACE|WS_NUMERIC|WS_ACCENT|WS_NOARTICLE);

return 0; }</lang>output<lang>Sort flags: (collapse spaces)

0000098 nina

100 NINA 100 niño 99 Ninja The work is so difficult it took some 100 aeons.

 The work is so difficult   it took ſome 99 æons.  
The work is so difficult to do it took ſome 100 aeons.  

Sort flags: (case insensitive)

 The work is so difficult   it took ſome 99 æons.  
0000098 nina
The work is so difficult to do it took ſome 100 aeons.  

100 NINA 100 niño 99 Ninja The work is so difficult it took some 100 aeons.

Sort flags: (numeric)

 The work is so difficult   it took ſome 99 æons.  
The work is so difficult to do it took ſome 100 aeons.  
0000098 nina

The work is so difficult it took some 100 aeons. 99 Ninja 100 NINA 100 niño

Sort flags: (collapse spaces, discard common words)

0000098 nina

100 NINA 100 niño 99 Ninja The work is so difficult it took some 100 aeons.

The work is so difficult to do it took ſome 100 aeons.  
 The work is so difficult   it took ſome 99 æons.  

Sort flags: (collapse spaces, case insensitive, disregard accent)

0000098 nina

100 NINA 100 niño 99 Ninja The work is so difficult it took some 100 aeons.

 The work is so difficult   it took ſome 99 æons.  
The work is so difficult to do it took ſome 100 aeons.  

Sort flags: (collapse spaces, case insensitive, disregard accent, decompose ligatures, discard common words, numeric)

The work is so difficult to do it took ſome 100 aeons.  
 The work is so difficult   it took ſome 99 æons.  

The work is so difficult it took some 100 aeons.

0000098 nina

99 Ninja 100 NINA 100 niño</lang>

D

Implements requests 1-5. <lang d>import std.stdio, std.string, std.algorithm, std.array, std.conv,

      std.ascii, std.range;

string[] naturalSort(string[] arr) /*pure @safe*/ {

   static struct Part {
       string s;
       int opCmp(in ref Part other) const pure {
           return (s[0].isDigit && other.s[0].isDigit) ?
                  cmp([s.to!ulong], [other.s.to!ulong]) :
                  cmp(s, other.s);
       }
   }
   static mapper(in string txt) /*pure nothrow @safe*/ {
       auto r = txt
                .strip
                .tr(whitespace, " ", "s")
                .toLower
                .groupBy!isDigit
                .map!(p => Part(p.text))
                .array;
       return (r.length > 1 && r[0].s == "the") ? r.dropOne : r;
   }
   return arr.schwartzSort!mapper.release;

}

void main() /*@safe*/ {

   auto tests = [
   // Ignoring leading spaces.
   ["ignore leading spaces: 2-2", " ignore leading spaces: 2-1", "
    ignore leading spaces: 2+1", "  ignore leading spaces: 2+0"],
   // Ignoring multiple adjacent spaces (m.a.s).
   ["ignore m.a.s spaces: 2-2", "ignore m.a.s  spaces: 2-1",
    "ignore m.a.s   spaces: 2+0", "ignore m.a.s    spaces: 2+1"],
   // Equivalent whitespace characters.
   ["Equiv. spaces: 3-3", "Equiv.\rspaces: 3-2",
    "Equiv.\x0cspaces: 3-1", "Equiv.\x0bspaces: 3+0",
    "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2"],
   // Case Indepenent sort.
   ["cASE INDEPENENT: 3-2", "caSE INDEPENENT: 3-1",
    "casE INDEPENENT: 3+0", "case INDEPENENT: 3+1"],
   // Numeric fields as numerics.
   ["foo100bar99baz0.txt", "foo100bar10baz0.txt",
    "foo1000bar99baz10.txt", "foo1000bar99baz9.txt"],
   // Title sorts.
   ["The Wind in the Willows", "The 40th step more",
    "The 39 steps", "Wanda"]];
   foreach (test; tests)
       writeln(test, "\n", test.naturalSort, "\n");

}</lang>

Output:
["ignore leading spaces: 2-2", " ignore leading spaces: 2-1", "   ignore leading spaces: 2+1", "  ignore leading spaces: 2+0"]
["  ignore leading spaces: 2+0", "   ignore leading spaces: 2+1", " ignore leading spaces: 2-1", "ignore leading spaces: 2-2"]

["ignore m.a.s spaces: 2-2", "ignore m.a.s  spaces: 2-1", "ignore m.a.s   spaces: 2+0", "ignore m.a.s    spaces: 2+1"]
["ignore m.a.s   spaces: 2+0", "ignore m.a.s    spaces: 2+1", "ignore m.a.s  spaces: 2-1", "ignore m.a.s spaces: 2-2"]

["Equiv. spaces: 3-3", "Equiv.\rspaces: 3-2", "Equiv.\fspaces: 3-1", "Equiv.\vspaces: 3+0", "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2"]
["Equiv.\vspaces: 3+0", "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2", "Equiv.\fspaces: 3-1", "Equiv.\rspaces: 3-2", "Equiv. spaces: 3-3"]

["cASE INDEPENENT: 3-2", "caSE INDEPENENT: 3-1", "casE INDEPENENT: 3+0", "case INDEPENENT: 3+1"]
["casE INDEPENENT: 3+0", "case INDEPENENT: 3+1", "caSE INDEPENENT: 3-1", "cASE INDEPENENT: 3-2"]

["foo100bar99baz0.txt", "foo100bar10baz0.txt", "foo1000bar99baz10.txt", "foo1000bar99baz9.txt"]
["foo100bar10baz0.txt", "foo100bar99baz0.txt", "foo1000bar99baz9.txt", "foo1000bar99baz10.txt"]

["The Wind in the Willows", "The 40th step more", "The 39 steps", "Wanda"]
["The 39 steps", "The 40th step more", "The Wind in the Willows", "Wanda"]

Go

This solution varies from the task in interpretation of rule 4, describing numeric fields. This solution follows other solutions on the page by treating the left-most fields as most significant. See talk page.

First four rules, no extra credit: <lang go>package main

import (

   "fmt"
   "regexp"
   "sort"
   "strconv"
   "strings"

)

var tests = []struct {

   descr string
   list  []string

}{

   {"Ignoring leading spaces", []string{
       "ignore leading spaces: 2-2",
       " ignore leading spaces: 2-1",
       "  ignore leading spaces: 2+0",
       "   ignore leading spaces: 2+1",
   }},
   {"Ignoring multiple adjacent spaces", []string{
       "ignore m.a.s spaces: 2-2",
       "ignore m.a.s  spaces: 2-1",
       "ignore m.a.s   spaces: 2+0",
       "ignore m.a.s    spaces: 2+1",
   }},
   {"Equivalent whitespace characters", []string{
       "Equiv. spaces: 3-3",
       "Equiv.\rspaces: 3-2",
       "Equiv.\fspaces: 3-1",
       "Equiv.\bspaces: 3+0",
       "Equiv.\nspaces: 3+1",
       "Equiv.\tspaces: 3+2",
   }},
   {"Case Indepenent sort", []string{
       "cASE INDEPENENT: 3-2",
       "caSE INDEPENENT: 3-1",
       "casE INDEPENENT: 3+0",
       "case INDEPENENT: 3+1",
   }},
   {"Numeric fields as numerics", []string{
       "foo100bar99baz0.txt",
       "foo100bar10baz0.txt",
       "foo1000bar99baz10.txt",
       "foo1000bar99baz9.txt",
   }},

}

func main() {

   for _, test := range tests {
       fmt.Println(test.descr)
       fmt.Println("Input order:")
       for _, s := range test.list {
           fmt.Printf("   %q\n", s)
       }
       fmt.Println("Natural order:")
       l := make(list, len(test.list))
       for i, s := range test.list {
           l[i] = newNatStr(s)
       }
       sort.Sort(l)
       for _, s := range l {
           fmt.Printf("   %q\n", s.s)
       }
       fmt.Println()
   }

}

// natStr associates a string with a preprocessed form type natStr struct {

   s string // original
   t []tok  // preprocessed "sub-fields"

}

func newNatStr(s string) (t natStr) {

   t.s = s
   s = strings.ToLower(strings.Join(strings.Fields(s), " "))
   x := dx.FindAllString(s, -1)
   t.t = make([]tok, len(x))
   for i, s := range x {
       if n, err := strconv.Atoi(s); err == nil {
           t.t[i].n = n
       } else {
           t.t[i].s = s
       }
   }
   return t

}

var dx = regexp.MustCompile(`\d+|\D+`)

// rule is to use s unless it is empty, then use n type tok struct {

   s string
   n int

}

// rule 2 of "numeric sub-fields" from talk page func (f1 tok) Cmp(f2 tok) int {

   switch {
   case f1.s == "":
       switch {
       case f2.s > "" || f1.n < f2.n:
           return -1
       case f1.n > f2.n:
           return 1
       }
   case f2.s == "" || f1.s > f2.s:
       return 1
   case f1.s < f2.s:
       return -1
   }
   return 0

}

type list []natStr

func (l list) Len() int { return len(l) } func (l list) Swap(i, j int) { l[i], l[j] = l[j], l[i] } func (l list) Less(i, j int) bool {

   ti := l[i].t
   for k, t := range l[j].t {
       if k == len(ti) {
           return true
       }
       switch ti[k].Cmp(t) {
       case -1:
           return true
       case 1:
           return false
       }
   }
   return false

}</lang>

Output:
Ignoring leading spaces
Input order:
   "ignore leading spaces: 2-2"
   " ignore leading spaces: 2-1"
   "  ignore leading spaces: 2+0"
   "   ignore leading spaces: 2+1"
Natural order:
   "  ignore leading spaces: 2+0"
   "   ignore leading spaces: 2+1"
   " ignore leading spaces: 2-1"
   "ignore leading spaces: 2-2"

Ignoring multiple adjacent spaces
Input order:
   "ignore m.a.s spaces: 2-2"
   "ignore m.a.s  spaces: 2-1"
   "ignore m.a.s   spaces: 2+0"
   "ignore m.a.s    spaces: 2+1"
Natural order:
   "ignore m.a.s   spaces: 2+0"
   "ignore m.a.s    spaces: 2+1"
   "ignore m.a.s  spaces: 2-1"
   "ignore m.a.s spaces: 2-2"

Equivalent whitespace characters
Input order:
   "Equiv. spaces: 3-3"
   "Equiv.\rspaces: 3-2"
   "Equiv.\fspaces: 3-1"
   "Equiv.\bspaces: 3+0"
   "Equiv.\nspaces: 3+1"
   "Equiv.\tspaces: 3+2"
Natural order:
   "Equiv.\bspaces: 3+0"
   "Equiv.\nspaces: 3+1"
   "Equiv.\tspaces: 3+2"
   "Equiv.\fspaces: 3-1"
   "Equiv.\rspaces: 3-2"
   "Equiv. spaces: 3-3"

Case Indepenent sort
Input order:
   "cASE INDEPENENT: 3-2"
   "caSE INDEPENENT: 3-1"
   "casE INDEPENENT: 3+0"
   "case INDEPENENT: 3+1"
Natural order:
   "casE INDEPENENT: 3+0"
   "case INDEPENENT: 3+1"
   "caSE INDEPENENT: 3-1"
   "cASE INDEPENENT: 3-2"

Numeric fields as numerics
Input order:
   "foo100bar99baz0.txt"
   "foo100bar10baz0.txt"
   "foo1000bar99baz10.txt"
   "foo1000bar99baz9.txt"
Natural order:
   "foo100bar10baz0.txt"
   "foo100bar99baz0.txt"
   "foo1000bar99baz9.txt"
   "foo1000bar99baz10.txt"

J

The natural way of approaching this task in J is to normalize the text based on the rules desired for sorting. Here, we limit ourselves to ascii, for portability, and decide that our domain shall be terminated strings (where each string ends with the same character - typically a newline):

<lang j>require'strings regex'

lines=: <;.2 titleFix=: ('^\s*(the|a|an)\b';)&rxrplc isNum=: e.&'0123456789' num=: ".^:(isNum@{.) split=: <@num/.~ [:+/\1,2 ~:/\ isNum norm=: [: split (32 9 12 13 14 15{a.) -.~ [: titleFix tolower

natSor=: lines ;@/: norm&.>@lines</lang>

Example data:

<lang j>IgnoringLeadingSpaces=:0 :0 ignore leading spaces: 2-2

ignore leading spaces: 2-1
 ignore leading spaces: 2+0
  ignore leading spaces: 2+1

)

IgnoringMultipleAdjacentSpaces=: 0 :0

ignore m.a.s spaces: 2-2
ignore m.a.s  spaces: 2-1
ignore m.a.s   spaces: 2+0
ignore m.a.s    spaces: 2+1

)

bsSubst=: rplc&((LF;'\'),('\r';13{a.),('\x0c';12{a.),('\x0b';11{a.),('\n';LF),:'\t';TAB)

EquivalentWhitespaceCharacters=: bsSubst 0 :0

Equiv. spaces: 3-3
Equiv.\rspaces: 3-2
Equiv.\x0cspaces: 3-1
Equiv.\x0bspaces: 3+0
Equiv.\nspaces: 3+1
Equiv.\tspaces: 3+2

)

CaseIndepenent=: 0 :0

cASE INDEPENENT: 3-2
caSE INDEPENENT: 3-1
casE INDEPENENT: 3+0
case INDEPENENT: 3+1

)

NumericFieldsAsNumerics=: 0 :0

foo100bar99baz0.txt
foo100bar10baz0.txt
foo1000bar99baz10.txt
foo1000bar99baz9.txt

)

Titles=: 0 :0

The Wind in the Willows
The 40th step more
The 39 steps
Wanda

)</lang>

Note that the required example which contains equivalent whitespace characters includes a '\n' in the data. So, for that example, we use a backslash as our terminator.

Example use:

<lang j> natSor IgnoringLeadingSpaces

 ignore leading spaces: 2+0
  ignore leading spaces: 2+1
ignore leading spaces: 2-1

ignore leading spaces: 2-2

  natSor IgnoringMultipleAdjacentSpaces
ignore m.a.s   spaces: 2+0
ignore m.a.s    spaces: 2+1
ignore m.a.s  spaces: 2-1
ignore m.a.s spaces: 2-2
  natSor EquivalentWhitespaceCharacters
Equiv.

spaces: 3+1\ Equiv.�spaces: 3+0\ Equiv. spaces: 3+2\ Equiv.�spaces: 3-1\ Equiv. spaces: 3-2\ Equiv. spaces: 3-3\

  natSor CaseIndepenent
casE INDEPENENT: 3+0
case INDEPENENT: 3+1
caSE INDEPENENT: 3-1
cASE INDEPENENT: 3-2
  natSor NumericFieldsAsNumerics
foo100bar10baz0.txt
foo100bar99baz0.txt
foo1000bar99baz9.txt
foo1000bar99baz10.txt
  natSor Titles
The 39 steps
The 40th step more
Wanda
The Wind in the Willows

</lang>

jq

Works with: jq version with regex support

For brevity, this implementation satisfies the first five requirements. It uses the standard approach for requirements 1, 2, 3, and 5, and addresses the fourth requirement by exploiting the ability of jq's builtin "sort" to sort arrays lexicographically. The essence of the matter therefore comes down to the filter named "splitup", which for clarity, we define here as a top-level function, as follows: <lang jq>def splitup:

 def tidy:
   if .[0] == "" then .[1:] else . end 
   | if .[length-1] == "" then .[0:length-1] else . end ;
 # a and b are assumed to be arrays:
 def alternate(a;b):
   reduce range(0; [a,b] | map(length) | max) as $i ([]; . + [a[$i], b[$i]]);
 ([splits("[0-9]+")] | tidy) as $a
 | ([splits("[^0-9]+")] | tidy | map(tonumber)) as $n
 | (test("^[0-9]")) as $nfirst
 | if $nfirst then alternate($n; $a) else alternate($a; $n) end ;

def natural_sort:

 def naturally:
   ascii_downcase
   | sub("^(the|a|an) "; "")  # leading the/a/an (as words)
   | gsub("\\p{Cc}+";" ")     # control characters
   | gsub("\\s+"; " ")        # whitespace
   | sub("^ *";"")            # leading whitespace
   | sub(" *$";"")            # trailing whitespace
   | splitup;                 # embedded integers
 sort_by(naturally);</lang>

Testing For simplicity, we use the input for the first four tasks as given above, but modified slightly so that it can be read n as valid JSON. In particular, the comments have been quoted. The test driver can be written as a one-liner: <lang jq>if type == "string" then "", . else natural_sort end</lang?

Output:

<lang sh>jq -r -f Natural_sorting.jq Natural_sorting.json

  1. Ignoring leading spaces

[

 "  ignore leading spaces: 2+0",
 "   ignore leading spaces: 2+1",
 " ignore leading spaces: 2-1",
 "ignore leading spaces: 2-2"

]

  1. Ignoring multiple adjacent spaces (m.a.s)

[

 "ignore m.a.s   spaces: 2+0",
 "ignore m.a.s    spaces: 2+1",
 "ignore m.a.s  spaces: 2-1",
 "ignore m.a.s spaces: 2-2"

]

  1. Equivalent whitespace characters

[

 "Equiv.\u000bspaces: 3+0",
 "Equiv.\nspaces: 3+1",
 "Equiv.\tspaces: 3+2",
 "Equiv.\fspaces: 3-1",
 "Equiv.\rspaces: 3-2",
 "Equiv. spaces: 3-3"

]

  1. Case Indepenent sort

[

 "casE INDEPENENT: 3+0",
 "case INDEPENENT: 3+1",
 "caSE INDEPENENT: 3-1",
 "cASE INDEPENENT: 3-2"

]

  1. Numeric fields as numerics

[

 "foo100bar10baz0.txt",
 "foo100bar99baz0.txt",
 "foo1000bar99baz9.txt",
 "foo1000bar99baz10.txt"

]

  1. Title sorts

[

 "The 39 steps",
 "The 40th step more",
 "Wanda",
 "The Wind in the Willows"

]</lang>

Perl

This implements all 8 requirements*:

<lang perl> use feature 'fc'; use Unicode::Normalize;

sub natural_sort {

   my @items = map {
       my $str = fc(NFKD($_));
       $str =~ s/\s+/ /;
       $str =~ s/|^(?:the|a|an) \b|\p{Nonspacing_Mark}| $//g;
       my @fields = $str =~ /(?!\z) ([^0-9]*+) ([0-9]*+)/gx;
       [$_, \@fields]
   } @_;
   return map { $_->[0] } sort {
       my @x = @{$a->[1]};
       my @y = @{$b->[1]};
       my $numeric;
       while (@x && @y) {
           my ($x, $y) = (shift @x, shift @y);
           return (($numeric = !$numeric) ? $x cmp $y : $x <=> $y or next);
       }
       return @x <=> @y;
   } @items;

} </lang>

*) Note that decomposing the strings to the NFKD normalization form and subsequently stripping off all code points of the Nonspacing_Mark category, removes differences caused by accents / ligatures / alternate character forms / etc. in a standards-compliant way. This coincides with all the examples given in the task description, with the exception that it does not replace "ʒ" with "s" — one could add
$str =~ tr/ʒ/s/;
for that but it seems a bit whimsical.)

Testing:

<lang perl> use utf8; # interpret this script's source code as UTF8 use Test::More; # for plan(), is_deeply() use Data::Dump; # for dd()

my @testcases = (

   ['Leading spaces',   '%sleading spaces: %i',  map {' ' x $_} 2, 3, 1, 0             ],
   ['Adjacent spaces',  'adjacent%s spaces: %i', map {' ' x $_} 2, 3, 1, 0             ],
   ['Different spaces', 'equiv.%sspaces: %i',    split //, "\x0b\n\t\x0c\r "           ],
   ['Case differences', '%s INDEPENENT: %i',     'casE', 'case', 'caSE', 'cASE'        ],
   ['Numeric fields',   'foo%ibar%ibaz%i.txt',   [100, 10, 0], [100, 99, 0],
                                                 [1000,99,9], [1000,99,10]             ],
   ['Title case',       '%s',                    'The 39 steps', 'The 40th step more',
                                                 'Wanda', 'The Wind in the Willows'    ],
   ['Accents',          'Equiv. %s accents: %i', 'y', 'Y', "\x{dd}", "\x{fd}"          ],
   ['Ligatures',        '%s',                    "IJ ligatured ij", 'no ligature'       ],
   ['Alternate forms',  'Start with an %s: %i',  's', 'ſ', 'ß'                         ],

);

plan tests => scalar @testcases;

foreach (@testcases) {

   my ($name, $pattern, @args) = @$_;
   my $i = 0;
   my @strings = map { sprintf $pattern, ref $_ ? @$_ : $_, $i++ } @args;
   
   is_deeply( [natural_sort(reverse sort @strings)], \@strings, $name );
   
   dd @strings;
   print "\n";

} </lang>

Output:
1..9
ok 1 - Leading spaces
(
  "  leading spaces: 0",
  "   leading spaces: 1",
  " leading spaces: 2",
  "leading spaces: 3",
)

ok 2 - Adjacent spaces
(
  "adjacent   spaces: 0",
  "adjacent    spaces: 1",
  "adjacent  spaces: 2",
  "adjacent spaces: 3",
)

ok 3 - Different spaces
(
  "equiv.\13spaces: 0",
  "equiv.\nspaces: 1",
  "equiv.\tspaces: 2",
  "equiv.\fspaces: 3",
  "equiv.\rspaces: 4",
  "equiv. spaces: 5",
)

ok 4 - Case differences
(
  "casE INDEPENENT: 0",
  "case INDEPENENT: 1",
  "caSE INDEPENENT: 2",
  "cASE INDEPENENT: 3",
)

ok 5 - Numeric fields
(
  "foo100bar10baz0.txt",
  "foo100bar99baz0.txt",
  "foo1000bar99baz9.txt",
  "foo1000bar99baz10.txt",
)

ok 6 - Title case
(
  "The 39 steps",
  "The 40th step more",
  "Wanda",
  "The Wind in the Willows",
)

ok 7 - Accents
(
  "Equiv. y accents: 0",
  "Equiv. Y accents: 1",
  "Equiv. \xDD accents: 2",
  "Equiv. \xFD accents: 3",
)

ok 8 - Ligatures
("\x{132} ligatured ij", "no ligature")

ok 9 - Alternate forms
(
  "Start with an s: 0",
  "Start with an \x{17F}: 1",
  "Start with an \xDF: 2",
)

Perl 6

Tested with Rakudo 2011.06

In Perl6 it is very easy to modify the default sorting order by passing in a transform routine to the sort function. If the transform routines are arity one, the sort function will apply a Schwartzian Transform so it only needs to calculate the transform once. Note that the transforms are non-destructive; The sort function returns the original strings.

The following are a series of subroutines to perform the various natural sorting transforms. They may be applied individually or mixed and matched to get the particular result desired. When more than one is strung together, they apply left to right. Some combinations may yield different results depending on the order they are applied.

<lang perl6># Sort groups of digits in number order. Sort by order of magnitude then lexically. sub naturally ($a) { $a.lc.subst(/(\d+)/, ->$/ {0~$0.chars.chr~$0},:g) ~"\x0"~$a }

  1. Collapse multiple ws characters to a single.

sub collapse ($a) { $a.subst( / ( \s ) $0+ /, -> $/ { $0 }, :g ) }

  1. Convert all ws characters to a space.

sub normalize ($a) { $a.subst( / ( \s ) /, ' ', :g ) }

  1. Ignore common leading articles for title sorts

sub title ($a) { $a.subst( / :i ^ ( a | an | the ) >> \s* /, ) }

  1. Decompose ISO-Latin1 glyphs to their base character.

sub latin1_decompose ($a) {

   my %tr = < 
      Æ AE æ ae Þ TH þ th Ð TH ð th ß ss À A Á A Â A Ã A Ä A Å A à a á a
       â a ã a ä a å a Ç C ç c È E É E Ê E Ë E è e é e ê e ë e Ì I Í I Î
       I Ï I ì i í i î i ï i Ò O Ó O Ô O Õ O Ö O Ø O ò o ó o ô o õ o ö o
       ø o Ñ N ñ n Ù U Ú U Û U Ü U ù u ú u û u ü u Ý Y ÿ y ý y
   >;
  1. Would probably be better implemented as
  2. $a.trans( [%tr.keys] => [%tr.values] );
  3. but the underlying Parrot VM leaks through in current Rakudo.
   my $re = '<[' ~ %tr.keys.join('|') ~ ']>';
   $a.subst(/ (<$re>) /, -> $/ { %tr{$0} }, :g)

}</lang>


Used as:

<lang perl6>my @tests = (

   [
       "Task 1a\nSort while ignoring leading spaces.",
       [
         'ignore leading spaces: 1', '   ignore leading spaces: 4',
         '  ignore leading spaces: 3', ' ignore leading spaces: 2'
       ],
       {.trim} # builtin method.
   ],
   [
       "Task 1b\nSort while ignoring multiple adjacent spaces.",
       [
         'ignore m.a.s   spaces: 3', 'ignore m.a.s spaces: 1',
         'ignore m.a.s    spaces: 4', 'ignore m.a.s  spaces: 2'
       ],
       {.&collapse}
   ],
   [
       "Task 2\nSort with all white space normalized to regular spaces.",
       [
         "Normalized\tspaces: 4", "Normalized\xa0spaces: 1",
         "Normalized\x20spaces: 2", "Normalized\nspaces: 3"
       ],
       {.&normalize}
   ],
   [
       "Task 3\nSort case independently.",
       [
         'caSE INDEPENDENT: 3', 'casE INDEPENDENT: 2',
         'cASE INDEPENDENT: 4', 'case INDEPENDENT: 1'
       ],
       {.lc} # builtin method
   ],
   [
       "Task 4\nSort groups of digits in natural number order.",
       [
         <Foo100bar99baz0.txt foo100bar10baz0.txt foo1000bar99baz10.txt
          foo1000bar99baz9.txt 201st 32nd 3rd 144th 17th 2 95>
       ],
       {.&naturally}
   ],
   [
       "Task 5 ( mixed with 1, 2, 3 & 4 )\n"
       ~ "Sort titles, normalize white space, collapse multiple spaces to\n"
       ~ "single, trim leading white space, ignore common leading articles\n"
       ~ 'and sort digit groups in natural order.',
       [
         'The Wind	in the Willows  8', '  The 39 Steps               3',
         'The    7th Seal              1', 'Wanda                        6',
         'A Fish Called Wanda          5', ' The Wind and the Lion       7',
         'Any Which Way But Loose      4', '12 Monkeys                   2'
       ],
       {.&normalize.&collapse.trim.&title.&naturally}
   ],
   [
       "Task 6, 7, 8\nMap letters in Latin1 that have accents or decompose to two\n"
       ~ 'characters to their base characters for sorting.',
       [
         <apple Ball bald car Card above Æon æon aether
           niño nina e-mail Évian evoke außen autumn>
       ],
       {.&latin1_decompose.&naturally}
   ]

);


for @tests -> $case {

   my $code_ref = $case.pop;
   my @array = $case.pop.list;
   say $case.pop, "\n";
   say "Standard Sort:\n";
   .say for @array.sort;
   say "\nNatural Sort:\n";
   .say for @array.sort: {.$code_ref};
   say "\n" ~ '*' x 40 ~ "\n";

}</lang>

Sample output:

Task 1a
Sort while ignoring leading spaces.

Standard Sort:

   ignore leading spaces: 4
  ignore leading spaces: 3
 ignore leading spaces: 2
ignore leading spaces: 1

Natural Sort:

ignore leading spaces: 1
 ignore leading spaces: 2
  ignore leading spaces: 3
   ignore leading spaces: 4

****************************************

Task 1b
Sort while ignoring multiple adjacent spaces.

Standard Sort:

ignore m.a.s    spaces: 4
ignore m.a.s   spaces: 3
ignore m.a.s  spaces: 2
ignore m.a.s spaces: 1

Natural Sort:

ignore m.a.s spaces: 1
ignore m.a.s  spaces: 2
ignore m.a.s   spaces: 3
ignore m.a.s    spaces: 4

****************************************

Task 2
Sort with all white space normalized to regular spaces.

Standard Sort:

Normalized	spaces: 4
Normalized
spaces: 3
Normalized spaces: 2
Normalized spaces: 1

Natural Sort:

Normalized spaces: 1
Normalized spaces: 2
Normalized
spaces: 3
Normalized	spaces: 4

****************************************

Task 3
Sort case independently.

Standard Sort:

cASE INDEPENDENT: 4
caSE INDEPENDENT: 3
casE INDEPENDENT: 2
case INDEPENDENT: 1

Natural Sort:

case INDEPENDENT: 1
casE INDEPENDENT: 2
caSE INDEPENDENT: 3
cASE INDEPENDENT: 4

****************************************

Task 4
Sort groups of digits in natural number order.

Standard Sort:

144th
17th
2
201st
32nd
3rd
95
Foo100bar99baz0.txt
foo1000bar99baz10.txt
foo1000bar99baz9.txt
foo100bar10baz0.txt

Natural Sort:

2
3rd
17th
32nd
95
144th
201st
foo100bar10baz0.txt
Foo100bar99baz0.txt
foo1000bar99baz9.txt
foo1000bar99baz10.txt

****************************************

Task 5 ( mixed with 1, 2, 3 & 4 )
Sort titles, normalize white space, collapse multiple spaces to
single, trim leading white space, ignore common leading articles
and sort digit groups in natural order.

Standard Sort:

  The 39 Steps               3
 The Wind and the Lion       7
12 Monkeys                   2
A Fish Called Wanda          5
Any Which Way But Loose      4
The    7th Seal              1
The Wind	in the Willows  8
Wanda                        6

Natural Sort:

The    7th Seal              1
12 Monkeys                   2
  The 39 Steps               3
Any Which Way But Loose      4
A Fish Called Wanda          5
Wanda                        6
 The Wind and the Lion       7
The Wind	in the Willows  8

****************************************

Task 6, 7, 8
Map letters in Latin1 that have accents or decompose to two
characters to their base characters for sorting.

Standard Sort:

Ball
Card
above
aether
apple
autumn
außen
bald
car
e-mail
evoke
nina
niño
Æon
Évian
æon

Natural Sort:

above
Æon
æon
aether
apple
außen
autumn
bald
Ball
car
Card
e-mail
Évian
evoke
nina
niño

****************************************

PicoLisp

This parser takes care of features 1,2,3,4,5 and 8: <lang PicoLisp>(de parseNatural (Str)

  (clip
     (make
        (for (L (chop Str)  L)
           (cond
              ((sp? (car L))
                 (link " ")
                 (while (and L (sp? (car L)))
                    (pop 'L) ) )
              ((>= "9" (car L) "0")
                 (link
                    (format
                       (make
                          (loop
                             (link (pop 'L))
                             (NIL (>= "9" (car L) "0")) ) ) ) ) )
              (T
                 (let Word
                    (pack
                       (replace
                          (make
                             (loop
                                (link (lowc (pop 'L)))
                                (NIL L)
                                (T (sp? (car L)))
                                (T (>= "9" (car L) "0")) ) )
                           "ß" "ss" "ſ" "s" "ʒ" "s" ) )
                    (unless (member Word '(the it to))
                       (link Word) ) ) ) ) ) ) ) )</lang>

Test: <lang PicoLisp>: (parseNatural " ^MThe abc123Defß ^I Ghi ") -> ("abc" 123 "defss" " " "ghi")</lang> Sorting is trivial then: <lang PicoLisp>(de naturalSort (Lst)

  (by parseNatural sort Lst) )</lang>

Test: <lang PicoLisp>(de *TestData

  "# Ignoring leading spaces"
  ("ignore leading spaces: 2-2" " ignore leading spaces: 2-1" "  ignore leading spaces: 2+0" "   ignore leading spaces: 2+1")
  "# Ignoring multiple adjacent spaces (m.a.s)"
  ("ignore m.a.s spaces: 2-2" "ignore m.a.s  spaces: 2-1" "ignore m.a.s   spaces: 2+0" "ignore m.a.s    spaces: 2+1")
  "# Equivalent whitespace characters"
  ("Equiv. spaces: 3-3" "Equiv.^Mspaces: 3-2" "Equiv.^Acspaces: 3-1" "Equiv.^Kbspaces: 3+0" "Equiv.^Jspaces: 3+1" "Equiv.^Ispaces: 3+2")
  "# Case Indepenent sort"
  ("cASE INDEPENENT: 3-2" "caSE INDEPENENT: 3-1" "casE INDEPENENT: 3+0" "case INDEPENENT: 3+1")
  "# Numeric fields as numerics"
  ("foo100bar99baz0.txt" "foo100bar10baz0.txt" "foo1000bar99baz10.txt" "foo1000bar99baz9.txt")
  "# Title sorts"
  ("The Wind in the Willows" "The 40th step more" "The 39 steps" "Wanda")
  "# Equivalent accented characters (and case)"
  ("Equiv. ý accents: 2-2" "Equiv. Ý accents: 2-1" "Equiv. y accents: 2+0" "Equiv. Y accents: 2+1")
  # "Separated ligatures"
  ### ("IJ ligatured ij" "no ligature")
  "# Character replacements"
  ("Start with an ʒ: 2-2" "Start with an ſ: 2-1" "Start with an ß: 2+0" "Start with an s: 2+1") )

(de pythonOut (Ttl Lst)

  (prinl Ttl)
  (prin "['" (car Lst))
  (for S (cdr Lst)
     (prin "',^J '" S) )
  (prinl "']") )

(for X *TestData

  (if (atom X)
     (prinl X)
     (pythonOut "Text strings:" X)
     (pythonOut "Normally sorted :" (sort (copy X)))
     (pythonOut "Naturally sorted:" (naturalSort X))
     (prinl) ) )</lang>

Output:

# Ignoring leading spaces
Text strings:
['ignore leading spaces: 2-2',
 ' ignore leading spaces: 2-1',
 '  ignore leading spaces: 2+0',
 '   ignore leading spaces: 2+1']
Normally sorted :
['   ignore leading spaces: 2+1',
 '  ignore leading spaces: 2+0',
 ' ignore leading spaces: 2-1',
 'ignore leading spaces: 2-2']
Naturally sorted:
['  ignore leading spaces: 2+0',
 '   ignore leading spaces: 2+1',
 ' ignore leading spaces: 2-1',
 'ignore leading spaces: 2-2']

# Ignoring multiple adjacent spaces (m.a.s)
Text strings:
['ignore m.a.s spaces: 2-2',
 'ignore m.a.s  spaces: 2-1',
 'ignore m.a.s   spaces: 2+0',
 'ignore m.a.s    spaces: 2+1']
Normally sorted :
['ignore m.a.s    spaces: 2+1',
 'ignore m.a.s   spaces: 2+0',
 'ignore m.a.s  spaces: 2-1',
 'ignore m.a.s spaces: 2-2']
Naturally sorted:
['ignore m.a.s   spaces: 2+0',
 'ignore m.a.s    spaces: 2+1',
 'ignore m.a.s  spaces: 2-1',
 'ignore m.a.s spaces: 2-2']

# Equivalent whitespace characters
Text strings:
['Equiv. spaces: 3-3',
 'Equiv.
spaces: 3-2',
 'Equiv.�cspaces: 3-1',
 'Equiv.�bspaces: 3+0',
 'Equiv.
spaces: 3+1',
 'Equiv.	spaces: 3+2']
Normally sorted :
['Equiv.�cspaces: 3-1',
 'Equiv.	spaces: 3+2',
 'Equiv.
spaces: 3+1',
 'Equiv.�bspaces: 3+0',
 'Equiv.
spaces: 3-2',
 'Equiv. spaces: 3-3']
Naturally sorted:
['Equiv.�bspaces: 3+0',
 'Equiv.�cspaces: 3-1',
 'Equiv.
spaces: 3+1',
 'Equiv.	spaces: 3+2',
 'Equiv.
spaces: 3-2',
 'Equiv. spaces: 3-3']

# Case Indepenent sort
Text strings:
['cASE INDEPENENT: 3-2',
 'caSE INDEPENENT: 3-1',
 'casE INDEPENENT: 3+0',
 'case INDEPENENT: 3+1']
Normally sorted :
['cASE INDEPENENT: 3-2',
 'caSE INDEPENENT: 3-1',
 'casE INDEPENENT: 3+0',
 'case INDEPENENT: 3+1']
Naturally sorted:
['casE INDEPENENT: 3+0',
 'case INDEPENENT: 3+1',
 'caSE INDEPENENT: 3-1',
 'cASE INDEPENENT: 3-2']

# Numeric fields as numerics
Text strings:
['foo100bar99baz0.txt',
 'foo100bar10baz0.txt',
 'foo1000bar99baz10.txt',
 'foo1000bar99baz9.txt']
Normally sorted :
['foo1000bar99baz10.txt',
 'foo1000bar99baz9.txt',
 'foo100bar10baz0.txt',
 'foo100bar99baz0.txt']
Naturally sorted:
['foo100bar10baz0.txt',
 'foo100bar99baz0.txt',
 'foo1000bar99baz9.txt',
 'foo1000bar99baz10.txt']

# Title sorts
Text strings:
['The Wind in the Willows',
 'The 40th step more',
 'The 39 steps',
 'Wanda']
Normally sorted :
['The 39 steps',
 'The 40th step more',
 'The Wind in the Willows',
 'Wanda']
Naturally sorted:
['The 39 steps',
 'The 40th step more',
 'Wanda',
 'The Wind in the Willows']

# Equivalent accented characters (and case)
Text strings:
['Equiv. ý accents: 2-2',
 'Equiv. Ý accents: 2-1',
 'Equiv. y accents: 2+0',
 'Equiv. Y accents: 2+1']
Normally sorted :
['Equiv. Y accents: 2+1',
 'Equiv. y accents: 2+0',
 'Equiv. Ý accents: 2-1',
 'Equiv. ý accents: 2-2']
Naturally sorted:
['Equiv. y accents: 2+0',
 'Equiv. Y accents: 2+1',
 'Equiv. Ý accents: 2-1',
 'Equiv. ý accents: 2-2']

# Character replacements
Text strings:
['Start with an ʒ: 2-2',
 'Start with an ſ: 2-1',
 'Start with an ß: 2+0',
 'Start with an s: 2+1']
Normally sorted :
['Start with an s: 2+1',
 'Start with an ß: 2+0',
 'Start with an ſ: 2-1',
 'Start with an ʒ: 2-2']
Naturally sorted:
['Start with an s: 2+1',
 'Start with an ſ: 2-1',
 'Start with an ʒ: 2-2',
 'Start with an ß: 2+0']

Python

All eight features: <lang python># -*- coding: utf-8 -*-

  1. Not Python 3.x (Can't compare str and int)


from itertools import groupby from unicodedata import decomposition, name from pprint import pprint as pp

commonleaders = ['the'] # lowercase leading words to ignore replacements = {u'ß': 'ss', # Map single char to replacement string

               u'ſ': 's',
               u'ʒ': 's',
               }

hexdigits = set('0123456789abcdef') decdigits = set('0123456789') # Don't use str.isnumeric

def splitchar(c):

   ' De-ligature. De-accent a char'
   de = decomposition(c)
   if de:
       # Just the words that are also hex numbers
       de = [d for d in de.split()
                 if all(c.lower()
                        in hexdigits for c in d)]
       n = name(c, c).upper()
       # (Gosh it's onerous)
       if len(de)> 1 and 'PRECEDE' in n:
           # E.g. ʼn  LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
           de[1], de[0] = de[0], de[1]
       tmp = [ unichr(int(k, 16)) for k in de]
       base, others = tmp[0], tmp[1:]
       if 'LIGATURE' in n:
           # Assume two character ligature
           base += others.pop(0)
   else:
       base = c
   return base
   

def sortkeygen(s):

   Generate 'natural' sort key for s
   Doctests:
       >>> sortkeygen('  some extra    spaces  ')
       [u'some extra spaces']
       >>> sortkeygen('CasE InseNsItIve')
       [u'case insensitive']
       >>> sortkeygen('The Wind in the Willows')
       [u'wind in the willows']
       >>> sortkeygen(u'\462 ligature')
       [u'ij ligature']
       >>> sortkeygen(u'\335\375 upper/lower case Y with acute accent')
       [u'yy upper/lower case y with acute accent']
       >>> sortkeygen('foo9.txt')
       [u'foo', 9, u'.txt']
       >>> sortkeygen('x9y99')
       [u'x', 9, u'y', 99]
   
   # Ignore leading and trailing spaces
   s = unicode(s).strip()
   # All space types are equivalent
   s = ' '.join(s.split())
   # case insentsitive
   s = s.lower()
   # Title
   words = s.split()
   if len(words) > 1 and words[0] in commonleaders:
       s = ' '.join( words[1:])
   # accent and ligatures
   s = .join(splitchar(c) for c in s)
   # Replacements (single char replaced by one or more)
   s = .join( replacements.get(ch, ch) for ch in s )
   # Numeric sections as numerics
   s = [ int("".join(g)) if isinteger else "".join(g)
         for isinteger,g in groupby(s, lambda x: x in decdigits)]
   return s

def naturalsort(items):

    Naturally sort a series of strings
   Doctests:
       >>> naturalsort(['The Wind in the Willows','The 40th step more',
                        'The 39 steps', 'Wanda'])
       ['The 39 steps', 'The 40th step more', 'Wanda', 'The Wind in the Willows']
   
   return sorted(items, key=sortkeygen)

if __name__ == '__main__':

   import string
   
   ns = naturalsort
   print '\n# Ignoring leading spaces'
   txt = ['%signore leading spaces: 2%+i' % (' '*i, i-2) for i in range(4)]
   print 'Text strings:'; pp(txt)
   print 'Normally sorted :'; pp(sorted(txt))
   print 'Naturally sorted:'; pp(ns(txt))
   print '\n# Ignoring multiple adjacent spaces (m.a.s)'
   txt = ['ignore m.a.s%s spaces: 2%+i' % (' '*i, i-2) for i in range(4)]
   print 'Text strings:'; pp(txt)
   print 'Normally sorted :'; pp(sorted(txt))
   print 'Naturally sorted:'; pp(ns(txt))
   print '\n# Equivalent whitespace characters'
   txt = ['Equiv.%sspaces: 3%+i' % (ch, i-3)
          for i,ch in enumerate(reversed(string.whitespace))]
   print 'Text strings:'; pp(txt)
   print 'Normally sorted :'; pp(sorted(txt))
   print 'Naturally sorted:'; pp(ns(txt))
   print '\n# Case Indepenent sort'
   s = 'CASE INDEPENENT'
   txt = [s[:i].lower() + s[i:] + ': 3%+i' % (i-3) for i in range(1,5)]
   print 'Text strings:'; pp(txt)
   print 'Normally sorted :'; pp(sorted(txt))
   print 'Naturally sorted:'; pp(ns(txt))
   print '\n# Numeric fields as numerics'
   txt = ['foo100bar99baz0.txt', 'foo100bar10baz0.txt',
          'foo1000bar99baz10.txt', 'foo1000bar99baz9.txt']
   print 'Text strings:'; pp(txt)
   print 'Normally sorted :'; pp(sorted(txt))
   print 'Naturally sorted:'; pp(ns(txt))
   print '\n# Title sorts'
   txt = ['The Wind in the Willows','The 40th step more',
                        'The 39 steps', 'Wanda']
   print 'Text strings:'; pp(txt)
   print 'Normally sorted :'; pp(sorted(txt))
   print 'Naturally sorted:'; pp(ns(txt))
   print '\n# Equivalent accented characters (and case)'
   txt = ['Equiv. %s accents: 2%+i' % (ch, i-2)
          for i,ch in enumerate(u'\xfd\xddyY')]
   print 'Text strings:'; pp(txt)
   print 'Normally sorted :'; pp(sorted(txt))
   print 'Naturally sorted:'; pp(ns(txt))
   print '\n# Separated ligatures'
   txt = [u'\462 ligatured ij', 'no ligature',]
   print 'Text strings:'; pp(txt)
   print 'Normally sorted :'; pp(sorted(txt))
   print 'Naturally sorted:'; pp(ns(txt))
   
   print '\n# Character replacements'
   s = u'ʒſßs' # u'\u0292\u017f\xdfs'
   txt = ['Start with an %s: 2%+i' % (ch, i-2)
          for i,ch in enumerate(s)]
   print 'Text strings:'; pp(txt)
   print 'Normally sorted :'; print '\n'.join(sorted(txt))
   print 'Naturally sorted:'; print '\n'.join(ns(txt))</lang>

Sample Python output

# Ignoring leading spaces
Text strings:
['ignore leading spaces: 2-2',
 ' ignore leading spaces: 2-1',
 '  ignore leading spaces: 2+0',
 '   ignore leading spaces: 2+1']
Normally sorted :
['   ignore leading spaces: 2+1',
 '  ignore leading spaces: 2+0',
 ' ignore leading spaces: 2-1',
 'ignore leading spaces: 2-2']
Naturally sorted:
['  ignore leading spaces: 2+0',
 '   ignore leading spaces: 2+1',
 ' ignore leading spaces: 2-1',
 'ignore leading spaces: 2-2']

# Ignoring multiple adjacent spaces (m.a.s)
Text strings:
['ignore m.a.s spaces: 2-2',
 'ignore m.a.s  spaces: 2-1',
 'ignore m.a.s   spaces: 2+0',
 'ignore m.a.s    spaces: 2+1']
Normally sorted :
['ignore m.a.s    spaces: 2+1',
 'ignore m.a.s   spaces: 2+0',
 'ignore m.a.s  spaces: 2-1',
 'ignore m.a.s spaces: 2-2']
Naturally sorted:
['ignore m.a.s   spaces: 2+0',
 'ignore m.a.s    spaces: 2+1',
 'ignore m.a.s  spaces: 2-1',
 'ignore m.a.s spaces: 2-2']

# Equivalent whitespace characters
Text strings:
['Equiv. spaces: 3-3',
 'Equiv.\rspaces: 3-2',
 'Equiv.\x0cspaces: 3-1',
 'Equiv.\x0bspaces: 3+0',
 'Equiv.\nspaces: 3+1',
 'Equiv.\tspaces: 3+2']
Normally sorted :
['Equiv.\tspaces: 3+2',
 'Equiv.\nspaces: 3+1',
 'Equiv.\x0bspaces: 3+0',
 'Equiv.\x0cspaces: 3-1',
 'Equiv.\rspaces: 3-2',
 'Equiv. spaces: 3-3']
Naturally sorted:
['Equiv.\x0bspaces: 3+0',
 'Equiv.\nspaces: 3+1',
 'Equiv.\tspaces: 3+2',
 'Equiv.\x0cspaces: 3-1',
 'Equiv.\rspaces: 3-2',
 'Equiv. spaces: 3-3']

# Case Indepenent sort
Text strings:
['cASE INDEPENENT: 3-2',
 'caSE INDEPENENT: 3-1',
 'casE INDEPENENT: 3+0',
 'case INDEPENENT: 3+1']
Normally sorted :
['cASE INDEPENENT: 3-2',
 'caSE INDEPENENT: 3-1',
 'casE INDEPENENT: 3+0',
 'case INDEPENENT: 3+1']
Naturally sorted:
['casE INDEPENENT: 3+0',
 'case INDEPENENT: 3+1',
 'caSE INDEPENENT: 3-1',
 'cASE INDEPENENT: 3-2']

# Numeric fields as numerics
Text strings:
['foo100bar99baz0.txt',
 'foo100bar10baz0.txt',
 'foo1000bar99baz10.txt',
 'foo1000bar99baz9.txt']
Normally sorted :
['foo1000bar99baz10.txt',
 'foo1000bar99baz9.txt',
 'foo100bar10baz0.txt',
 'foo100bar99baz0.txt']
Naturally sorted:
['foo100bar10baz0.txt',
 'foo100bar99baz0.txt',
 'foo1000bar99baz9.txt',
 'foo1000bar99baz10.txt']

# Title sorts
Text strings:
['The Wind in the Willows', 'The 40th step more', 'The 39 steps', 'Wanda']
Normally sorted :
['The 39 steps', 'The 40th step more', 'The Wind in the Willows', 'Wanda']
Naturally sorted:
['The 39 steps', 'The 40th step more', 'Wanda', 'The Wind in the Willows']

# Equivalent accented characters (and case)
Text strings:
[u'Equiv. \xfd accents: 2-2',
 u'Equiv. \xdd accents: 2-1',
 u'Equiv. y accents: 2+0',
 u'Equiv. Y accents: 2+1']
Normally sorted :
[u'Equiv. Y accents: 2+1',
 u'Equiv. y accents: 2+0',
 u'Equiv. \xdd accents: 2-1',
 u'Equiv. \xfd accents: 2-2']
Naturally sorted:
[u'Equiv. y accents: 2+0',
 u'Equiv. Y accents: 2+1',
 u'Equiv. \xdd accents: 2-1',
 u'Equiv. \xfd accents: 2-2']

# Separated ligatures
Text strings:
[u'\u0132 ligatured ij', 'no ligature']
Normally sorted :
['no ligature', u'\u0132 ligatured ij']
Naturally sorted:
[u'\u0132 ligatured ij', 'no ligature']

# Character replacements
Text strings:
[u'Start with an \u0292: 2-2',
 u'Start with an \u017f: 2-1',
 u'Start with an \xdf: 2+0',
 u'Start with an s: 2+1']
Normally sorted :
Start with an s: 2+1
Start with an ß: 2+0
Start with an ſ: 2-1
Start with an ʒ: 2-2
Naturally sorted:
Start with an s: 2+1
Start with an ſ: 2-1
Start with an ʒ: 2-2
Start with an ß: 2+0

Racket

Implements 1-4 (but only normalize spaces -- don't ignore spaces at the beginning/end, easy to implement, but sounds wrong).

<lang racket>

  1. lang racket

(define (natural-sort l)

 (define (list<? l1 l2)
   (cond [(null? l2) #f]
         [(null? l1) #t]
         [(number? (car l1)) (cond [(< (car l1) (car l2)) #t]
                                   [(< (car l2) (car l1)) #f]
                                   [else (list<? (cdr l1) (cdr l2))])]
         [(string? (car l1)) (cond [(string<? (car l1) (car l2)) #t]
                                   [(string<? (car l2) (car l1)) #f]
                                   [else (list<? (cdr l1) (cdr l2))])]))
 (define (->keys s)
   (define s* (string-normalize-spaces (string-foldcase s)))
   (for/list ([x (regexp-match* #px"\\d+" s* #:gap-select? #t)]
              [i (in-naturals)])
     (if (odd? i) (string->number x) x)))
 (sort l list<? #:key ->keys #:cache-keys? #t))

(natural-sort

(shuffle '("foo9.txt" "foo10.txt" "x9y99" "x9y100" "x10y0" "x  z" "x y")))
=> '("foo9.txt" "foo10.txt" "x9y99" "x9y100" "x10y0" "x y" "x z")

</lang>

Ruby

Requirements 1,2,3 and 5 are met in one line of code: <lang ruby>ar.sort_by{|str| str.downcase.gsub(/\Athe |\Aa |\Aan /, "").lstrip.gsub(/\s+/, " ")}</lang> Almost all of the code below is handling requirement 4. The problem is that Ruby will happily sort ["a",1] against ["a",2] or even ["b"], but it does not know how to handle [1, "a"] against ["a", 2] and raises an ArgumentError. The code below does not define a new sort method, it defines a new class which is sortable by the existing method (falling back on string comparison). <lang ruby>class NatSortString

 include Comparable
 attr_reader :scrubbed, :ints_and_strings, :i_s_pattern
 def initialize(str)
   @str = str
   @scrubbed = str.downcase.gsub(/\Athe |\Aa |\Aan /, "").lstrip.gsub(/\s+/, " ")
   @ints_and_strings = @scrubbed.scan(/\d+|\D+/).map{|s| s =~ /\d/ ? s.to_i : s}
   @i_s_pattern = @ints_and_strings.map{|el| el.is_a?(Integer) ? :i : :s}.join
 end
 def <=> (other)
   if i_s_pattern.start_with?(other.i_s_pattern) or other.i_s_pattern.start_with?(i_s_pattern) then
     ints_and_strings <=> other.ints_and_strings
   else
     scrubbed <=> other.scrubbed
   end
 end
 
 def to_s
   @str.dup
 end

end </lang> Demo: <lang ruby>tests =

 {"Ignoring leading spaces" =>
 [ "ignore leading spaces: 2-2 ",  " ignore leading spaces: 2-1 ",  "  ignore leading spaces: 2+0 ",  "   ignore leading spaces: 2+1 "],
 "Ignoring multiple adjacent spaces" =>
 [ "ignore m.a.s spaces: 2-2 ",  "ignore m.a.s  spaces: 2-1 ",  "ignore m.a.s   spaces: 2+0 ",  "ignore m.a.s    spaces: 2+1 "],
 "Equivalent whitespace characters" =>
 ["Equiv. spaces: 3-3", "Equiv.\rspaces: 3-2", "Equiv.\x0cspaces: 3-1", "Equiv.\x0bspaces: 3+0", "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2"],
 "Case Indepenent sort" =>
 [ "cASE INDEPENENT: 3-2 ",  "caSE INDEPENENT: 3-1 ",  "casE INDEPENENT: 3+0 ",  "case INDEPENENT: 3+1 "],
 "Numeric fields as numerics" =>
 [ "foo100bar99baz0.txt ",  "foo100bar10baz0.txt ",  "foo1000bar99baz10.txt ",  "foo1000bar99baz9.txt "],
 "Title sorts" =>
 [ "The Wind in the Willows ",  "The 40th step more ",  "The 39 steps ",  "Wanda "]}

tests.each do |title, ar|

 nat_sorts = ar.map{|s| NatSortString.new(s)}
 puts [title,"--input--", ar, "--normal sort--", ar.sort, "--natural sort--", nat_sorts.sort, "\n"] 

end </lang>

Output:
Ignoring leading spaces
--input--
ignore leading spaces: 2-2 
 ignore leading spaces: 2-1 
  ignore leading spaces: 2+0 
   ignore leading spaces: 2+1 
--normal sort--
   ignore leading spaces: 2+1 
  ignore leading spaces: 2+0 
 ignore leading spaces: 2-1 
ignore leading spaces: 2-2 
--natural sort--
  ignore leading spaces: 2+0 
   ignore leading spaces: 2+1 
 ignore leading spaces: 2-1 
ignore leading spaces: 2-2 

Ignoring multiple adjacent spaces
--input--
ignore m.a.s spaces: 2-2 
ignore m.a.s  spaces: 2-1 
ignore m.a.s   spaces: 2+0 
ignore m.a.s    spaces: 2+1 
--normal sort--
ignore m.a.s    spaces: 2+1 
ignore m.a.s   spaces: 2+0 
ignore m.a.s  spaces: 2-1 
ignore m.a.s spaces: 2-2 
--natural sort--
ignore m.a.s   spaces: 2+0 
ignore m.a.s    spaces: 2+1 
ignore m.a.s  spaces: 2-1 
ignore m.a.s spaces: 2-2 

Equivalent whitespace characters
--input--
Equiv. spaces: 3-3
spaces: 3-2
Equiv.
      spaces: 3-1
Equiv.
      spaces: 3+0
Equiv.
spaces: 3+1
Equiv.	spaces: 3+2
--normal sort--
Equiv.	spaces: 3+2
Equiv.
spaces: 3+1
Equiv.
      spaces: 3+0
Equiv.
      spaces: 3-1
spaces: 3-2
Equiv. spaces: 3-3
--natural sort--
Equiv.
      spaces: 3+0
Equiv.
spaces: 3+1
Equiv.	spaces: 3+2
Equiv.
      spaces: 3-1
spaces: 3-2
Equiv. spaces: 3-3

Case Indepenent sort
--input--
cASE INDEPENENT: 3-2 
caSE INDEPENENT: 3-1 
casE INDEPENENT: 3+0 
case INDEPENENT: 3+1 
--normal sort--
cASE INDEPENENT: 3-2 
caSE INDEPENENT: 3-1 
casE INDEPENENT: 3+0 
case INDEPENENT: 3+1 
--natural sort--
casE INDEPENENT: 3+0 
case INDEPENENT: 3+1 
caSE INDEPENENT: 3-1 
cASE INDEPENENT: 3-2 

Numeric fields as numerics
--input--
foo100bar99baz0.txt 
foo100bar10baz0.txt 
foo1000bar99baz10.txt 
foo1000bar99baz9.txt 
--normal sort--
foo1000bar99baz10.txt 
foo1000bar99baz9.txt 
foo100bar10baz0.txt 
foo100bar99baz0.txt 
--natural sort--
foo100bar10baz0.txt 
foo100bar99baz0.txt 
foo1000bar99baz9.txt 
foo1000bar99baz10.txt 

Title sorts
--input--
The Wind in the Willows 
The 40th step more 
The 39 steps 
Wanda 
--normal sort--
The 39 steps 
The 40th step more 
The Wind in the Willows 
Wanda 
--natural sort--
The 39 steps 
The 40th step more 
Wanda 
The Wind in the Willows 

Scala

All 8: <lang Scala>object NaturalSorting {

 implicit object ArrayOrdering extends Ordering[Array[String]] { // 4
   val INT = "([0-9]+)".r
   def compare(a: Array[String], b: Array[String]) = {
     val l = Math.min(a.length, b.length)
     (0 until l).prefixLength(i => a(i) equals b(i)) match {
       case i if i == l => Math.signum(b.length - a.length).toInt
       case i => (a(i), b(i)) match {
         case (INT(c), INT(d)) => Math.signum(c.toInt - d.toInt).toInt
         case (c, d) => c compareTo d
       }
     }
   }
 }
 def natural(s: String) = {
   val replacements = Map('\u00df' -> "ss", '\u017f' -> "s", '\u0292' -> "s").withDefault(s => s.toString) // 8
   import java.text.Normalizer
   Normalizer.normalize(Normalizer.normalize(
     s.trim.toLowerCase, // 1.1, 1.2, 3
     Normalizer.Form.NFKC), // 7
     Normalizer.Form.NFD).replaceAll("[\\p{InCombiningDiacriticalMarks}]", "") // 6
    .replaceAll("^(the|a|an) ", "") // 5
    .flatMap(replacements.apply) // 8
    .split(s"\\s+|(?=[0-9])(?<=[^0-9])|(?=[^0-9])(?<=[0-9])") // 1.3, 2 and 4
 }

}

object NaturalSortingTest extends App {

 import NaturalSorting._
 val tests = List(
   ("1 Ignoring leading spaces", List("ignore leading spaces: 2-2", " ignore leading spaces: 2-1", "  ignore leading spaces: 2+0", "   ignore leading spaces: 2+1"), List("  ignore leading spaces: 2+0", "   ignore leading spaces: 2+1", " ignore leading spaces: 2-1", "ignore leading spaces: 2-2")),
   ("1 Ignoring multiple adjacent spaces (m.a.s)", List("ignore m.a.s spaces: 2-2", "ignore m.a.s  spaces: 2-1", "ignore m.a.s   spaces: 2+0", "ignore m.a.s  spaces: 2+1"), List("ignore m.a.s   spaces: 2+0", "ignore m.a.s  spaces: 2+1", "ignore m.a.s  spaces: 2-1", "ignore m.a.s spaces: 2-2")),
   ("2 Equivalent whitespace characters", List("Equiv. spaces: 3-3", "Equiv.\rspaces: 3-2", "Equiv.\u000cspaces: 3-1", "Equiv.\u000bspaces: 3+0", "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2"), List("Equiv.\u000bspaces: 3+0", "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2", "Equiv.\u000cspaces: 3-1", "Equiv.\rspaces: 3-2", "Equiv. spaces: 3-3")),
   ("3 Case Independent sort", List("cASE INDEPENENT: 3-2", "caSE INDEPENENT: 3-1", "casE INDEPENENT: 3+0", "case INDEPENENT: 3+1"), List("casE INDEPENENT: 3+0", "case INDEPENENT: 3+1", "caSE INDEPENENT: 3-1", "cASE INDEPENENT: 3-2")),
   ("4 Numeric fields as numerics", List("foo100bar99baz0.txt", "foo100bar10baz0.txt", "foo1000bar99baz10.txt", "foo1000bar99baz9.txt"), List("foo100bar10baz0.txt", "foo100bar99baz0.txt", "foo1000bar99baz9.txt", "foo1000bar99baz10.txt")),
   ("5 Title sorts", List("The Wind in the Willows", "The 40th step more", "The 39 steps", "Wanda"), List("The 39 steps", "The 40th step more", "Wanda", "The Wind in the Willows")),
   ("6 Equivalent accented characters (and case)", List("Equiv. \u00fd accents: 2-2", "Equiv. \u00dd accents: 2-1", "Equiv. y accents: 2+0", "Equiv. Y accents: 2+1"), List("Equiv. y accents: 2+0", "Equiv. Y accents: 2+1", "Equiv. \u00dd accents: 2-1", "Equiv. \u00fd accents: 2-2")),
   ("7 Separated ligatures", List("\u0132 ligatured ij", "no ligature"), List("\u0132 ligatured ij", "no ligature")),
   ("8 Character replacements", List("Start with an \u0292: 2-2", "Start with an \u017f: 2-1", "Start with an \u00df: 2+0", "Start with an s: 2+1"), List("Start with an s: 2+1", "Start with an \u017f: 2-1", "Start with an \u0292: 2-2", "Start with an \u00df: 2+0"))
 )
 val width = tests.flatMap(_._2).map(_.length).max
 assert(tests.forall{case (title, input, expected) =>
   val result = input.sortBy(natural)
   val okay = result == expected
   val label = if (okay) "pass" else "fail"
   println(s"$label: $title".toUpperCase)
   input.zip(result).foreach{case (a, b) => println(s"  ${a.padTo(width, ' ')}  |  ${b.padTo(width, ' ')}")}
   okay
 })

}</lang> Output:

PASS: 1 IGNORING LEADING SPACES
  ignore leading spaces: 2-2     |    ignore leading spaces: 2+0
   ignore leading spaces: 2-1    |     ignore leading spaces: 2+1
    ignore leading spaces: 2+0   |   ignore leading spaces: 2-1
     ignore leading spaces: 2+1  |  ignore leading spaces: 2-2
PASS: 1 IGNORING MULTIPLE ADJACENT SPACES (M.A.S)
  ignore m.a.s spaces: 2-2       |  ignore m.a.s   spaces: 2+0
  ignore m.a.s  spaces: 2-1      |  ignore m.a.s  spaces: 2+1
  ignore m.a.s   spaces: 2+0     |  ignore m.a.s  spaces: 2-1
  ignore m.a.s  spaces: 2+1      |  ignore m.a.s spaces: 2-2
PASS: 2 EQUIVALENT WHITESPACE CHARACTERS
  Equiv. spaces: 3-3             |  Equiv.spaces: 3+0
spaces: 3-2             |  Equiv.
spaces: 3+1
  Equiv.spaces: 3-1             |  Equiv.       spaces: 3+2
  Equiv.spaces: 3+0             |  Equiv.spaces: 3-1
  Equiv.
spaces: 3-2             |  Equiv.
  Equiv.        spaces: 3+2             |  Equiv. spaces: 3-3
PASS: 3 CASE INDEPENDENT SORT
  cASE INDEPENENT: 3-2           |  casE INDEPENENT: 3+0
  caSE INDEPENENT: 3-1           |  case INDEPENENT: 3+1
  casE INDEPENENT: 3+0           |  caSE INDEPENENT: 3-1
  case INDEPENENT: 3+1           |  cASE INDEPENENT: 3-2
PASS: 4 NUMERIC FIELDS AS NUMERICS
  foo100bar99baz0.txt            |  foo100bar10baz0.txt
  foo100bar10baz0.txt            |  foo100bar99baz0.txt
  foo1000bar99baz10.txt          |  foo1000bar99baz9.txt
  foo1000bar99baz9.txt           |  foo1000bar99baz10.txt
PASS: 5 TITLE SORTS
  The Wind in the Willows        |  The 39 steps
  The 40th step more             |  The 40th step more
  The 39 steps                   |  Wanda
  Wanda                          |  The Wind in the Willows
PASS: 6 EQUIVALENT ACCENTED CHARACTERS (AND CASE)
  Equiv. ý accents: 2-2          |  Equiv. y accents: 2+0
  Equiv. Ý accents: 2-1          |  Equiv. Y accents: 2+1
  Equiv. y accents: 2+0          |  Equiv. Ý accents: 2-1
  Equiv. Y accents: 2+1          |  Equiv. ý accents: 2-2
PASS: 7 SEPARATED LIGATURES
  IJ ligatured ij                 |  IJ ligatured ij
  no ligature                    |  no ligature
PASS: 8 CHARACTER REPLACEMENTS
  Start with an ʒ: 2-2           |  Start with an s: 2+1
  Start with an ſ: 2-1           |  Start with an ſ: 2-1
  Start with an ß: 2+0           |  Start with an ʒ: 2-2
  Start with an s: 2+1           |  Start with an ß: 2+0

Tcl

Tcl supports two methods of doing sorting by non-natural keys in the lsort command: the -command option allows the specification of code that makes the ordering decision for a pair of values, but instead the code below demonstrates sorting through the use of collation keys, strings that when sorted in their normal order result in the natural order being used. (These are handled through the use of the -indices option which makes it easy to generate a sorted original list without any need to build compound intermediate tuples.)

Note also that Tcl supports case-insensitive sorting and “treat digit sequences as numbers” as native sorting options. (The latter is particularly useful for handling filenames.) <lang tcl>package require Tcl 8.5

proc sortWithCollationKey {keyBuilder list} {

   if {![llength $list]} return
   foreach value $list {

lappend toSort [{*}$keyBuilder $value]

   }
   foreach idx [lsort -indices $toSort] {

lappend result [lindex $list $idx]

   }
   return $result

} proc normalizeSpaces {str} {

   regsub -all {[ ]+} [string trim $str " "] " "

} proc equivalentWhitespace {str} {

   regsub -all {\s} $str " "

}

proc show {description sorter strings} {

   puts "Input:\n\t[join $strings \n\t]"
   set sorted [lsort $strings]
   puts "Normally sorted:\n\t[join $sorted \n\t]"
   set sorted [{*}$sorter $strings]
   puts "Naturally sorted with ${description}:\n\t[join $sorted \n\t]"

}

  1. Two demonstrations of the space normalizer

show "normalized spaces" {sortWithCollationKey normalizeSpaces} {

   {ignore leading spaces: 2-2}
   { ignore leading spaces: 2-1}
   {  ignore leading spaces: 2+0}
   {   ignore leading spaces: 2+1}}

show "normalized spaces" {sortWithCollationKey normalizeSpaces} {

   {ignore m.a.s spaces: 2-2}
   {ignore m.a.s  spaces: 2-1}
   {ignore m.a.s   spaces: 2+0}
   {ignore m.a.s    spaces: 2+1}}
  1. Use a collation key that maps all whitespace to spaces

show "all whitespace equivalent" {sortWithCollationKey equivalentWhitespace} {

   "Equiv. spaces: 3-3"
   "Equiv.\rspaces: 3-2"
   "Equiv.\u000cspaces: 3-1"
   "Equiv.\u000bspaces: 3+0"
   "Equiv.\nspaces: 3+1"
   "Equiv.\tspaces: 3+2"}
  1. These are built-in modes

show "(built-in) case insensitivity" {lsort -nocase} {

   {cASE INDEPENENT: 3-2}
   {caSE INDEPENENT: 3-1}
   {casE INDEPENENT: 3+0}
   {case INDEPENENT: 3+1}}

show "digit sequences as numbers" {lsort -dictionary} {

   foo100bar99baz0.txt
   foo100bar10baz0.txt
   foo1000bar99baz10.txt
   foo1000bar99baz9.txt}</lang>

Output:

Input:
	ignore leading spaces: 2-2
	 ignore leading spaces: 2-1
	  ignore leading spaces: 2+0
	   ignore leading spaces: 2+1
Normally sorted:
	   ignore leading spaces: 2+1
	  ignore leading spaces: 2+0
	 ignore leading spaces: 2-1
	ignore leading spaces: 2-2
Naturally sorted with normalized spaces:
	  ignore leading spaces: 2+0
	   ignore leading spaces: 2+1
	 ignore leading spaces: 2-1
	ignore leading spaces: 2-2
Input:
	ignore m.a.s spaces: 2-2
	ignore m.a.s  spaces: 2-1
	ignore m.a.s   spaces: 2+0
	ignore m.a.s    spaces: 2+1
Normally sorted:
	ignore m.a.s    spaces: 2+1
	ignore m.a.s   spaces: 2+0
	ignore m.a.s  spaces: 2-1
	ignore m.a.s spaces: 2-2
Naturally sorted with normalized spaces:
	ignore m.a.s   spaces: 2+0
	ignore m.a.s    spaces: 2+1
	ignore m.a.s  spaces: 2-1
	ignore m.a.s spaces: 2-2
Input:
	Equiv. spaces: 3-3
spaces: 3-2iv.
	Equiv.
              spaces: 3-1
	Equiv.
              spaces: 3+0
	Equiv.
spaces: 3+1
	Equiv.	spaces: 3+2
Normally sorted:
	Equiv.	spaces: 3+2
	Equiv.
spaces: 3+1
	Equiv.
              spaces: 3+0
	Equiv.
              spaces: 3-1
spaces: 3-2iv.
	Equiv. spaces: 3-3
Naturally sorted with all whitespace equivalent:
	Equiv.
              spaces: 3+0
	Equiv.
spaces: 3+1
	Equiv.	spaces: 3+2
	Equiv.
              spaces: 3-1
spaces: 3-2iv.
	Equiv. spaces: 3-3
Input:
	cASE INDEPENENT: 3-2
	caSE INDEPENENT: 3-1
	casE INDEPENENT: 3+0
	case INDEPENENT: 3+1
Normally sorted:
	cASE INDEPENENT: 3-2
	caSE INDEPENENT: 3-1
	casE INDEPENENT: 3+0
	case INDEPENENT: 3+1
Naturally sorted with (built-in) case insensitivity:
	casE INDEPENENT: 3+0
	case INDEPENENT: 3+1
	caSE INDEPENENT: 3-1
	cASE INDEPENENT: 3-2
Input:
	foo100bar99baz0.txt
	foo100bar10baz0.txt
	foo1000bar99baz10.txt
	foo1000bar99baz9.txt
Normally sorted:
	foo1000bar99baz10.txt
	foo1000bar99baz9.txt
	foo100bar10baz0.txt
	foo100bar99baz0.txt
Naturally sorted with digit sequences as numbers:
	foo100bar10baz0.txt
	foo100bar99baz0.txt
	foo1000bar99baz9.txt
	foo1000bar99baz10.txt

zkl

These are all immutable list sorts. If using mutable lists, mangle a copy and sort that.

First, a tag-mangled-fields sort, use tags to pull from the original list in sorted order function: <lang zkl>fcn dsuSort(x,orig){ // decorate-sort-undecorate sort

  x.enumerate().sort(fcn([(_,a)],[(_,b)]){a<b})
  .apply('wrap([(n,_)]){orig[n]});

}</lang> Now, mangle a copy of the original list to "normalize" it to the task requirements and sort. <lang zkl># Ignoring leading spaces ts1:=T("ignore leading spaces: 2-2", " ignore leading spaces: 2-1",

 "  ignore leading spaces: 2+0", "   ignore leading spaces: 2+1");

dsuSort(ts1.apply("strip"),ts1).println();</lang>

Output:
"  ignore leading spaces: 2+0"
"   ignore leading spaces: 2+1"
" ignore leading spaces: 2-1"
"ignore leading spaces: 2-2"

<lang zkl># Ignoring multiple adjacent spaces (m.a.s) ts2:=T("ignore m.a.s spaces: 2-2", "ignore m.a.s spaces: 2-1",

 "ignore m.a.s   spaces: 2+0", "ignore m.a.s    spaces: 2+1");

dsuSort(ts2.apply('-(" ")),ts2).println();</lang>

Output:
"ignore m.a.s   spaces: 2+0"
"ignore m.a.s    spaces: 2+1"
"ignore m.a.s  spaces: 2-1"
"ignore m.a.s spaces: 2-2"

<lang zkl># Equivalent whitespace characters ts3:=T("Equiv. spaces: 3-3", "Equiv.\rspaces: 3-2", "Equiv.\x0cspaces: 3-1",

      "Equiv.\x0bspaces: 3+0", "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2");

dsuSort(ts3.apply('-.fp1("\n\r\t\f\b\x0b ")),ts3).println();</lang>

Output:
"Equiv.?spaces: 3+0"
"Equiv.?spaces: 3+1"
"Equiv.	spaces: 3+2"
"Equiv.?spaces: 3-1"
"Equiv.?spaces: 3-2"
"Equiv. spaces: 3-3"

<lang zkl># Case Indepenent sort ts4:=T("cASE INDEPENENT: 3-2", "caSE INDEPENENT: 3-1",

      "casE INDEPENENT: 3+0", "case INDEPENENT: 3+1");

dsuSort(ts4.apply("toLower"),ts4).println();</lang>

Output:
"casE INDEPENENT: 3+0"
"case INDEPENENT: 3+1"
"caSE INDEPENENT: 3-1"
"cASE INDEPENENT: 3-2"

And the nasty one: multiple numeric fields. Each key is blown apart into a list of ints, where each int is consecutive numeric characters. To compare a pair of fields (ie two lists of ints), walk the two in parallel, stopping when a difference is found. Ugly but it works. Another tactic would be to normalize each field to the same length, concat the fields and use that as the mangled key. But that might have issues if the number of fields differ, the former doesn't care. <lang zkl># Numeric fields as numerics fcn fieldize(s){

  s.apply(fcn(c){"1234567890".holds(c) and c or "."}).split(".")
  .filter().apply("toInt");

} fcn fcmp(a,b){ // T(key, T(numeric fields)), eg L(0, L(100,99,0))

  a[1].zip(b[1]).reduce(fcn(_,[(a,b)]){
     if(a==b)return(True); // continue to next field
     return(Void.Stop,a<b);
  },True);

} fcn fsort(list){

  list.apply(fieldize).enumerate().sort(fcmp)
  .apply('wrap([(n,_)]){list[n]});

}</lang> <lang zkl>ts5:=T("foo100bar99baz0.txt", "foo100bar10baz0.txt", "foo1000bar99baz10.txt",

      "foo1000bar99baz9.txt");

fsort(ts5).println();

x:=T("x9y99","foo10.txt","x10y0","foo9.txt","x9y100"); fsort(x).println();</lang>

Output:
"foo100bar10baz0.txt"
"foo100bar99baz0.txt"
"foo1000bar99baz9.txt"
"foo1000bar99baz10.txt"

L("foo9.txt","x9y99","x9y100","x10y0","foo10.txt")