WiktionaryDumps to words
- Task
Make a file that can be useful with spell checkers like Ispell and Aspell.
Use the wiktionary dump (input) to create a file equivalent to "/usr/share/dict/spanish" (output). The input file is an XML dump of the Wiktionary that is a bz2'ed file of about 800MB. The output file should be a file similar to "/usr/share/dict/spanish", which contains one word of a given language per line in a simple text file. An example of such a file is available in Ubuntu with the package wspanish.
C
<lang C>#include <stdio.h>
- include <stdlib.h>
- include <stdbool.h>
- include <string.h>
- include <unistd.h>
- include <expat.h>
- include <pcre.h>
- ifdef XML_LARGE_SIZE
- define XML_FMT_INT_MOD "ll"
- else
- define XML_FMT_INT_MOD "l"
- endif
- ifdef XML_UNICODE_WCHAR_T
- define XML_FMT_STR "ls"
- else
- define XML_FMT_STR "s"
- endif
void reset_char_data_buffer(); void process_char_data_buffer();
static bool last_tag_is_title; static bool last_tag_is_text;
static pcre *reCompiled; static pcre_extra *pcreExtra;
/* Expat start-of-element handler: flush any pending character data from
   the previous node, then record whether we just entered <title> or <text>. */
void start_element(void *data, const char *element, const char **attribute) {
    process_char_data_buffer();
    reset_char_data_buffer();
    if (strcmp(element, "title") == 0) {
        last_tag_is_title = true;
    }
    if (strcmp(element, "text") == 0) {
        last_tag_is_text = true;
    }
}
/* Expat end-of-element handler: the element's text is now complete, so
   act on the accumulated character data and discard it. */
void end_element(void *data, const char *el) {
    process_char_data_buffer();
    reset_char_data_buffer();
}
#define TITLE_BUF_SIZE (1024 * 8)

/* Accumulator for the character data of the current element. */
static char char_data_buffer[1024 * 64 * 8];
/* Title of the page currently being parsed. */
static char title_buffer[TITLE_BUF_SIZE];
static size_t offs;   /* bytes used in char_data_buffer */
static bool overflow; /* true once an element's text no longer fits */
/* Discards any buffered character data and clears the overflow flag. */
void reset_char_data_buffer(void) {
    overflow = false;
    offs = 0;
}
// pastes parts of the node together void char_data(void *userData, const XML_Char *s, int len) {
if (!overflow) { if (len + offs >= sizeof(char_data_buffer)) { overflow = true; fprintf(stderr, "Warning: buffer overflow\n"); fflush(stderr); } else { memcpy(char_data_buffer + offs, s, len); offs += len; } }
}
void try_match();
// if the element is the one we're after void process_char_data_buffer(void) {
if (offs > 0) { char_data_buffer[offs] = '\0';
if (last_tag_is_title) { unsigned int n = (offs+1 > TITLE_BUF_SIZE) ? TITLE_BUF_SIZE : (offs+1); memcpy(title_buffer, char_data_buffer, n); last_tag_is_title = false; } if (last_tag_is_text) { try_match(); last_tag_is_text = false; } }
}
/* Runs the compiled regex over the buffered <text> content; on a match,
   prints the page title saved earlier, otherwise reports PCRE errors. */
void try_match() {
    int ovector[80];
    int ovecsize = sizeof(ovector) / sizeof(int);
    int rc = pcre_exec(reCompiled, pcreExtra, char_data_buffer,
                       strlen(char_data_buffer), 0, 0, ovector, ovecsize);
    if (rc >= 0) {
        puts(title_buffer); // print the word
        return;
    }
    switch (rc) {
        case PCRE_ERROR_NOMATCH:
            break;
        case PCRE_ERROR_NULL:
            fprintf(stderr, "Something was null\n");
            break;
        case PCRE_ERROR_BADOPTION:
            fprintf(stderr, "A bad option was passed\n");
            break;
        case PCRE_ERROR_BADMAGIC:
            fprintf(stderr, "Magic number bad (compiled re corrupt?)\n");
            break;
        case PCRE_ERROR_UNKNOWN_NODE:
            fprintf(stderr, "Something kooky in the compiled re\n");
            break;
        case PCRE_ERROR_NOMEMORY:
            fprintf(stderr, "Ran out of memory\n");
            break;
        default:
            fprintf(stderr, "Unknown error\n");
            break;
    }
}
- define BUF_SIZE 1024
int main(int argc, char *argv[]) {
char buffer[BUF_SIZE]; int n;
const char *pcreErrorStr; int pcreErrorOffset; char *aStrRegex; char **aLineToMatch;
// Using PCRE
aStrRegex = "(.*)(==French==)(.*)"; // search for French language
reCompiled = pcre_compile(aStrRegex, PCRE_DOTALL | PCRE_UTF8, &pcreErrorStr, &pcreErrorOffset, NULL); if (reCompiled == NULL) { fprintf(stderr, "ERROR: Could not compile regex '%s': %s\n", aStrRegex, pcreErrorStr); exit(1); }
pcreExtra = pcre_study(reCompiled, 0, &pcreErrorStr); if (pcreErrorStr != NULL) { fprintf(stderr, "ERROR: Could not study regex '%s': %s\n", aStrRegex, pcreErrorStr); exit(1); }
// Using Expat parser
XML_Parser parser = XML_ParserCreate(NULL);
XML_SetElementHandler(parser, start_element, end_element); XML_SetCharacterDataHandler(parser, char_data);
reset_char_data_buffer();
while (1) { int done; int len;
len = (int)fread(buffer, 1, BUF_SIZE, stdin); if (ferror(stdin)) { fprintf(stderr, "Read error\n"); exit(1); } done = feof(stdin);
if (XML_Parse(parser, buffer, len, done) == XML_STATUS_ERROR) { fprintf(stderr, "Parse error at line %" XML_FMT_INT_MOD "u:\n%" XML_FMT_STR "\n", XML_GetCurrentLineNumber(parser), XML_ErrorString(XML_GetErrorCode(parser))); exit(1); }
if (done) break; }
XML_ParserFree(parser);
pcre_free(reCompiled);
if (pcreExtra != NULL) {
- ifdef PCRE_CONFIG_JIT
pcre_free_study(pcreExtra);
- else
pcre_free(pcreExtra);
- endif
}
return 0;
}</lang>
- Output:
$ gcc wikt_to_words.c -o wikt_to_words -lpcre -lexpat
$ wget --quiet https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 -O - | bzcat | \
  ./wikt_to_words
gratis
gratuit
livre
chien
pond
pies
pie
A
connotation
minute
...
Java
<lang java>import org.xml.sax.*; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.SAXException;
import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import javax.xml.parsers.ParserConfigurationException;
import java.util.regex.Pattern; import java.util.regex.Matcher;
class MyHandler extends DefaultHandler {
    private static final String TITLE = "title";
    private static final String TEXT = "text";
    /* Compile the language-section pattern once instead of on every
       characters() callback (the old code recompiled it per call). */
    private static final Pattern FRENCH = Pattern.compile(".*==French==.*", Pattern.DOTALL);

    private String lastTag = "";
    private String title = "";

    /**
     * Collects character data. Inside &lt;title&gt; it remembers the page
     * title; inside &lt;text&gt; it prints the remembered title when the
     * page body contains a ==French== section.
     * NOTE(review): SAX may split an element's text across several
     * characters() calls, so a marker straddling two chunks could be
     * missed -- same limitation as the original code.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        switch (lastTag) {
            case TITLE:
                title = new String(ch, start, length);
                break;
            case TEXT:
                String text = new String(ch, start, length);
                Matcher mat = FRENCH.matcher(text);
                if (mat.matches()) {
                    System.out.println(title);
                }
                break;
        }
    }

    /** Remembers which element we just entered. */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attrs) throws SAXException {
        lastTag = qName;
    }

    /** Clears the element marker when the element closes. */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        lastTag = "";
    }
}
public class WiktoWords {
    /**
     * Reads a Wiktionary XML dump from standard input and prints the
     * title of every page whose text contains a ==French== section.
     */
    public static void main(java.lang.String[] args) {
        try {
            SAXParserFactory spFactory = SAXParserFactory.newInstance();
            SAXParser saxParser = spFactory.newSAXParser();
            MyHandler handler = new MyHandler();
            saxParser.parse(new InputSource(System.in), handler);
        } catch (Exception e) {
            // Report the failure instead of exiting silently
            // (the old code swallowed the exception entirely).
            e.printStackTrace();
            System.exit(1);
        }
    }
}</lang>
- Output:
$ javac WiktoWords.java $ wget --quiet https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 -O - | bzcat | \ java WiktoWords gratis gratuit livre chien pond pies pie A connotation minute ...
Julia
Uses Regex and a state variable instead of XML parsing. Default setting prints the first 80 French words found. <lang julia>using CodecBzip2
"""
    getwords(io, output; languagemark="==French==", maxwords=80)

Scan the XML dump on `io` line by line with a small state machine (no XML
parser): remember each page's `<title>`, note when a `<text>` element has
just opened, and when `languagemark` appears right after it, write the
title to `output`. Stops after `maxwords` words.
"""
function getwords(io::IO, output::IO; languagemark = "==French==", maxwords = 80)
    titletag, textopen, textclose = "<title>", "<text", "</text>"
    sawtext = false            # did we just pass a <text ...> opener?
    nfound = 0
    pagetitle = ""
    for line in eachline(io)
        if occursin(titletag, line)
            sawtext = false
            m = match(r"<title>([^<]+)</title>", line)
            pagetitle = m === nothing ? "" : m[1]
        elseif occursin(textopen, line)
            sawtext = true
        elseif occursin(languagemark, line)
            if sawtext && pagetitle != ""
                println(output, pagetitle)
                (nfound += 1) >= maxwords && break
            end
            sawtext = false
        elseif occursin(textclose, line)
            sawtext = false
        end
    end
end
const url = "https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2"
const urlfile = "wikidump.bz2"

# Fetch the dump only when there is no non-empty local copy already.
stat(urlfile).size == 0 && download(url, urlfile)

# Decompress on the fly while scanning for words.
const stream = Bzip2DecompressorStream(open(urlfile))
getwords(stream, stdout) # or open a file to write to and use its IO handle instead of stdout
</lang>
- Output:
gratis gratuit livre chien pond pies pie A connotation minute trade adjective adjectival substantive patronage merchandise eagle fa fable a- abaca abada abalone abandon abattoir abaxial abbatial abdication abdicative abdomen abdominal abdominales abduction aberrance aberrant aberration abhorrent abhorrer abime abject abjection abjuration abjure abjurer ablactation ablation ablative able abluent ablution abolition abominable abomination abord abortive about abracadabra abrase abrasion abrasive abraxas abreuvoir abrogation abrogative abrupt on abscission abscond absconder quiz nu lente été servant robot y absent absenter absolution absorbable
OCaml
Using the library xmlm:
<lang ocaml>(* Stream the dump from stdin through xmlm, tracking the open-tag stack;
   remember each <title>'s text and print it whenever the following
   <text> node contains "==French==". *)
let () =
  let input = Xmlm.make_input ~strip:true (`Channel stdin) in
  let title = ref "" in
  let tag_path = ref [] in
  let push tag = tag_path := tag :: !tag_path in
  let pop () =
    match !tag_path with
    | [] -> ()
    | _ :: rest -> tag_path := rest
  in
  let in_tag name =
    match !tag_path with
    | [] -> false
    | current :: _ -> current = name
  in
  let french_re = Str.regexp_string "==French==" in
  let mentions_french s =
    try ignore (Str.search_forward french_re s 0); true
    with Not_found -> false
  in
  while not (Xmlm.eoi input) do
    match Xmlm.input input with
    | `Dtd _ -> ()
    | `El_start ((_, tag_name), _) -> push tag_name
    | `El_end -> pop ()
    | `Data s ->
      if in_tag "title" then title := s;
      if in_tag "text" && mentions_french s then print_endline !title
  done</lang>
- Output:
wget --quiet https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 -O - | bzcat | \ ocaml str.cma -I $(ocamlfind query xmlm) xmlm.cma to_words.ml gratis gratuit livre chien pond pies pie A connotation minute ...
Perl
<lang perl># 20211214 Perl programming solution
use strict;
use warnings;
use LWP::UserAgent;
use Compress::Raw::Bzip2;

my $LanguageMark = "==French==";    # language section heading to search for
my $Target       = 5;               # words
my $URL = 'https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2';

my %needles;                        # titles found so far
my $plain = my $tail = '';          # decompressed text / unfinished last line
my $ua = LWP::UserAgent->new;
# NOTE(review): hashref options follow the original; confirm against the
# installed Compress::Raw::Bzip2 constructor signature.
my $bz = Compress::Raw::Bunzip2->new( { -Bufsize => 1, -AppendOutput => 0 } );

my $res = $ua->request(
    HTTP::Request->new( GET => $URL ),
    sub {    # @_ = (data chunk, HTTP::Response)
        my ($chunk) = @_;
        # Decompress the whole chunk in one call; with -AppendOutput => 0
        # $plain is overwritten on each callback, which is what the
        # tail-carrying logic below expects.
        my $status = $bz->bzinflate( $chunk, $plain );
        return if $status != BZ_OK and $status != BZ_STREAM_END;

        if ( my @haystacks = split "\n", $plain ) {
            # Rejoin the line that was split across chunk boundaries,
            # and hold back the (possibly unfinished) last line.
            $haystacks[0] = $tail . $haystacks[0];
            $tail = $haystacks[-1];
            my ( $title, $got_text_last ) = ( '', 0 );
            foreach ( @haystacks[ 0 .. $#haystacks - 1 ] ) {
                if (/<title>(\w+?)<\/title>/) {
                    ( $title, $got_text_last ) = ( $1, 0 );
                }
                elsif (/<text/) {
                    $got_text_last = 1;
                }
                elsif (/$LanguageMark/) {
                    $needles{$title}++ if $got_text_last and length $title;
                    if ( keys %needles >= $Target ) {
                        print "$_\n" for sort keys %needles;
                        exit;
                    }
                    $got_text_last = 0;
                }
                elsif (/<\/text>/) {
                    $got_text_last = 0;
                }
            }
        }
    }
);</lang>
- Output:
chien gratis gratuit livre pond
Phix
Does not rely on wget/bzcat etc. Downloads in 16K or so blocks, unpacks one block at a time in memory, terminates properly when 5 or more words are found.
Tested on Windows, should be fine on Linux as long as you can provide a suitable bz2.so
constant url = "https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2"
include builtins/libcurl.e
include builtins/bzstream.e

bool got_text_last = false
integer wordcount = 0
string titleword = ""

-- Line-oriented state machine: remember titles, flag <text> openers,
-- and print the title when ==French== follows one. Returns false to
-- stop the download once enough words have been printed.
function doline(object line)
    if not string(line) then -- (opt: close output file)
        return false
    end if
    integer k = match("<title>", line)
    if k then
        got_text_last = false
        k += length("<title>")
        integer l = match("</title>", line, k)
        titleword = iff(l?line[k..l-1]:"")
    elsif match("<text", line) then
        got_text_last = true
    elsif match("==French==", line) then
        if got_text_last and titleword != "" then
            printf(1,"%s\n", titleword)
            wordcount += 1
            if wordcount >= 5 then -- (opt: close output file)
                return false
            end if
        end if
        got_text_last = false
    elsif match("</text>", line) then
        got_text_last = false
    end if
    return true
end function

atom tbr = 0 -- Total Bytes Written
string demiline = "" -- partial line carried between blocks
constant BLOCKSIZE = 8192
atom outbuf = allocate(BLOCKSIZE)

-- libcurl write callback: feed each downloaded chunk through the bz2
-- decompressor and hand complete lines to doline().
function write_callback(atom pData, integer size, integer nmemb, atom pUserdata)
    integer bytes_written = size*nmemb
    tbr += bytes_written
    set_struct_field(id_bzs,p_bzs,"next_in",pData)
    set_struct_field(id_bzs,p_bzs,"avail_in",bytes_written)
    set_struct_field(id_bzs,p_bzs,"next_out",outbuf)
    set_struct_field(id_bzs,p_bzs,"avail_out",BLOCKSIZE)
    while true do
        integer res = BZ2_bzDecompress(),
                avail_in = get_struct_field(id_bzs,p_bzs,"avail_in"),
                avail_out = get_struct_field(id_bzs,p_bzs,"avail_out")
        if avail_out<BLOCKSIZE then
            string block = demiline & peek({outbuf,BLOCKSIZE-avail_out})
            integer linestart = 1
            for i=1 to length(block) do
                if block[i]='\n' then
                    if not doline(block[linestart..i-1]) then
                        BZ2_bzDecompressEnd()
                        return 0 -- terminate download
                    end if
                    linestart = i+1
                end if
            end for
            demiline = block[linestart..$]
            set_struct_field(id_bzs,p_bzs,"next_out",outbuf)
            set_struct_field(id_bzs,p_bzs,"avail_out",BLOCKSIZE)
        end if
        if res=BZ_STREAM_END then
            BZ2_bzDecompressEnd()
            return 0
        end if
        if res!=BZ_OK then ?9/0 end if
        if avail_in=0 then exit end if
    end while
    return bytes_written
end function

constant write_cb = call_back({'+',routine_id("write_callback")})
atom curl = curl_easy_init()
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb)
curl_easy_setopt(curl, CURLOPT_URL, url)
BZ2_bzDecompressInit()
integer res = curl_easy_perform(curl)
curl_easy_cleanup(curl)
printf(1,"Total downloaded: %s\n",{file_size_k(tbr)})
- Output:
gratis gratuit livre chien pond Total downloaded: 239.67KB
Raku
I misunderstood the data format, so the processing logic below is copied verbatim from the Julia entry. <lang perl6># 20211209 Raku programming solution
use LWP::Simple; use Compress::Bzip2; use IO::Socket::SSL;
my $LanguageMark = "==French=="; my $Target = 5; # words my $URL = 'https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2';
class CustomLWP is LWP::Simple {
    has $.URL;

    # Streams the bz2 dump over TLS, decompressing on the fly, and returns
    # the first $Target page titles whose wikitext contains $LanguageMark.
    # (Bug fix: the garbled initializers `= ;` and `= , False ;` from the
    # wiki extraction are restored to real empty-string defaults.)
    method CustomRequest {
        my Blob $resp = Buf.new;
        my $bzip = Compress::Bzip2::Stream.new;
        my $tail = '';     # unfinished trailing line carried between chunks
        my %needles;       # titles found so far

        my ($host, $port, $path) = self.parse_url($.URL)[1..3];
        my $sock = IO::Socket::SSL.new: :$host, :$port;

        # Hand-rolled HTTP request so the body can be read incrementally.
        $sock.print(
            "GET {$path} HTTP/1.1\r\n"
            ~ self.stringify_headers( {
                'Connection' => 'close',
                'User-Agent' => "LWP::Simple/{LWP::Simple::<$VERSION>} "
                                ~ "Raku/{$*RAKU.compiler.gist}",
                'Host'       => $host
            } )
            ~ "\r\n"
        ) or die ; # request string

        while !self.got-header($resp) { $resp ~= $sock.read(2048) }

        my $bzip-stream = supply {
            emit self.parse_response($resp)[2]; # $resp_content @ parent class
            loop {
                done if %needles.elems >= $Target;
                ( my $chunk = $sock.read(4096) ) ?? emit $chunk !! done
            }
        }

        react {
            whenever $bzip-stream -> $crypt {
                my $plain = ( [~] $bzip.decompress: $crypt ).decode('utf8-c8');
                my @haystacks = $plain.split: "\n";
                @haystacks[0] = $tail ~ @haystacks[0];
                $tail = @haystacks[*-1];

                my ($title, $got_text_last) = '', False;

                for @haystacks[0..*-2] {
                    if / '<title>' (\w+?) '</title>' / {
                        ($title, $got_text_last) = $0, False;
                    }
                    elsif / '<text' / { $got_text_last = True }
                    elsif / $LanguageMark / {
                        %needles{$title}++ if ( $got_text_last and $title.Bool );
                        last if ( %needles.elems >= $Target );
                        $got_text_last = False;
                    }
                    elsif / '</text>' / { $got_text_last = False }
                }
            }
        }
        return %needles.keys[^$Target]
    }
}
# Run the custom request and print each collected word.
my $ua = CustomLWP.new: URL => $URL;
.say for $ua.CustomRequest;</lang>
- Output:
chien gratuit gratis pond livre