WiktionaryDumps to words
- Task
Make a file that can be useful with spell checkers like Ispell and Aspell.
Use the wiktionary dump (input) to create a file equivalent to "/usr/share/dict/spanish" (output). The input file is an XML dump of the Wiktionary that is a bz2'ed file of about 800MB. The output file should be a file similar to "/usr/share/dict/spanish", which contains one word of a given language per line in a simple text file. An example of such a file is available in Ubuntu with the package wspanish.
C
<lang C>#include <stdio.h>
- include <stdlib.h>
- include <stdbool.h>
- include <string.h>
- include <unistd.h>
- include <expat.h>
- include <pcre.h>
- ifdef XML_LARGE_SIZE
- define XML_FMT_INT_MOD "ll"
- else
- define XML_FMT_INT_MOD "l"
- endif
- ifdef XML_UNICODE_WCHAR_T
- define XML_FMT_STR "ls"
- else
- define XML_FMT_STR "s"
- endif
void reset_char_data_buffer(); void process_char_data_buffer();
static bool last_tag_is_title; static bool last_tag_is_text;
static pcre *reCompiled; static pcre_extra *pcreExtra;
/* Expat start-of-element handler: flush any pending character data from
   the previous node, then record whether we just entered <title> or <text>. */
void start_element(void *data, const char *element, const char **attribute) {
    process_char_data_buffer();
    reset_char_data_buffer();
    if (strcmp(element, "title") == 0) {
        last_tag_is_title = true;
    }
    if (strcmp(element, "text") == 0) {
        last_tag_is_text = true;
    }
}
/* Expat end-of-element handler: the element's text is now complete, so
   act on the accumulated character data and discard it. */
void end_element(void *data, const char *el) {
    process_char_data_buffer();
    reset_char_data_buffer();
}
#define TITLE_BUF_SIZE (1024 * 8)

/* Accumulator for the character data of the current element. */
static char char_data_buffer[1024 * 64 * 8];
/* Title of the page currently being parsed. */
static char title_buffer[TITLE_BUF_SIZE];
static size_t offs;   /* bytes used in char_data_buffer */
static bool overflow; /* true once an element's text no longer fits */
/* Discards any buffered character data and clears the overflow flag. */
void reset_char_data_buffer(void) {
    overflow = false;
    offs = 0;
}
// pastes parts of the node together void char_data(void *userData, const XML_Char *s, int len) {
if (!overflow) { if (len + offs >= sizeof(char_data_buffer)) { overflow = true; fprintf(stderr, "Warning: buffer overflow\n"); fflush(stderr); } else { memcpy(char_data_buffer + offs, s, len); offs += len; } }
}
void try_match();
// if the element is the one we're after void process_char_data_buffer(void) {
if (offs > 0) { char_data_buffer[offs] = '\0';
if (last_tag_is_title) { unsigned int n = (offs+1 > TITLE_BUF_SIZE) ? TITLE_BUF_SIZE : (offs+1); memcpy(title_buffer, char_data_buffer, n); last_tag_is_title = false; } if (last_tag_is_text) { try_match(); last_tag_is_text = false; } }
}
/* Runs the compiled regex over the buffered <text> content; on a match,
   prints the page title saved earlier, otherwise reports PCRE errors. */
void try_match() {
    int ovector[80];
    int ovecsize = sizeof(ovector) / sizeof(int);
    int rc = pcre_exec(reCompiled, pcreExtra, char_data_buffer,
                       strlen(char_data_buffer), 0, 0, ovector, ovecsize);
    if (rc >= 0) {
        puts(title_buffer); // print the word
        return;
    }
    switch (rc) {
        case PCRE_ERROR_NOMATCH:
            break;
        case PCRE_ERROR_NULL:
            fprintf(stderr, "Something was null\n");
            break;
        case PCRE_ERROR_BADOPTION:
            fprintf(stderr, "A bad option was passed\n");
            break;
        case PCRE_ERROR_BADMAGIC:
            fprintf(stderr, "Magic number bad (compiled re corrupt?)\n");
            break;
        case PCRE_ERROR_UNKNOWN_NODE:
            fprintf(stderr, "Something kooky in the compiled re\n");
            break;
        case PCRE_ERROR_NOMEMORY:
            fprintf(stderr, "Ran out of memory\n");
            break;
        default:
            fprintf(stderr, "Unknown error\n");
            break;
    }
}
- define BUF_SIZE 1024
int main(int argc, char *argv[]) {
char buffer[BUF_SIZE]; int n;
const char *pcreErrorStr; int pcreErrorOffset; char *aStrRegex; char **aLineToMatch;
// Using PCRE
aStrRegex = "(.*)(==French==)(.*)"; // search for French language
reCompiled = pcre_compile(aStrRegex, PCRE_DOTALL | PCRE_UTF8, &pcreErrorStr, &pcreErrorOffset, NULL); if (reCompiled == NULL) { fprintf(stderr, "ERROR: Could not compile regex '%s': %s\n", aStrRegex, pcreErrorStr); exit(1); }
pcreExtra = pcre_study(reCompiled, 0, &pcreErrorStr); if (pcreErrorStr != NULL) { fprintf(stderr, "ERROR: Could not study regex '%s': %s\n", aStrRegex, pcreErrorStr); exit(1); }
// Using Expat parser
XML_Parser parser = XML_ParserCreate(NULL);
XML_SetElementHandler(parser, start_element, end_element); XML_SetCharacterDataHandler(parser, char_data);
reset_char_data_buffer();
while (1) { int done; int len;
len = (int)fread(buffer, 1, BUF_SIZE, stdin); if (ferror(stdin)) { fprintf(stderr, "Read error\n"); exit(1); } done = feof(stdin);
if (XML_Parse(parser, buffer, len, done) == XML_STATUS_ERROR) { fprintf(stderr, "Parse error at line %" XML_FMT_INT_MOD "u:\n%" XML_FMT_STR "\n", XML_GetCurrentLineNumber(parser), XML_ErrorString(XML_GetErrorCode(parser))); exit(1); }
if (done) break; }
XML_ParserFree(parser);
pcre_free(reCompiled);
if (pcreExtra != NULL) {
- ifdef PCRE_CONFIG_JIT
pcre_free_study(pcreExtra);
- else
pcre_free(pcreExtra);
- endif
}
return 0;
}</lang>
- Output:
$ gcc wikt_to_words.c -o wikt_to_words -lpcre -lexpat
$ wget --quiet https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 -O - | bzcat | \
  ./wikt_to_words
gratis
gratuit
livre
chien
pond
pies
pie
A
connotation
minute
...
Java
<lang java>import org.xml.sax.*; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.SAXException;
import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import javax.xml.parsers.ParserConfigurationException;
import java.util.regex.Pattern; import java.util.regex.Matcher;
class MyHandler extends DefaultHandler {
    private static final String TITLE = "title";
    private static final String TEXT = "text";
    /* Compile the language-section pattern once instead of on every
       characters() callback (the old code recompiled it per call). */
    private static final Pattern FRENCH = Pattern.compile(".*==French==.*", Pattern.DOTALL);

    private String lastTag = "";
    private String title = "";

    /**
     * Collects character data. Inside &lt;title&gt; it remembers the page
     * title; inside &lt;text&gt; it prints the remembered title when the
     * page body contains a ==French== section.
     * NOTE(review): SAX may split an element's text across several
     * characters() calls, so a marker straddling two chunks could be
     * missed -- same limitation as the original code.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        switch (lastTag) {
            case TITLE:
                title = new String(ch, start, length);
                break;
            case TEXT:
                String text = new String(ch, start, length);
                Matcher mat = FRENCH.matcher(text);
                if (mat.matches()) {
                    System.out.println(title);
                }
                break;
        }
    }

    /** Remembers which element we just entered. */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attrs) throws SAXException {
        lastTag = qName;
    }

    /** Clears the element marker when the element closes. */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        lastTag = "";
    }
}
public class WiktoWords {
    /**
     * Reads a Wiktionary XML dump from standard input and prints the
     * title of every page whose text contains a ==French== section.
     */
    public static void main(java.lang.String[] args) {
        try {
            SAXParserFactory spFactory = SAXParserFactory.newInstance();
            SAXParser saxParser = spFactory.newSAXParser();
            MyHandler handler = new MyHandler();
            saxParser.parse(new InputSource(System.in), handler);
        } catch (Exception e) {
            // Report the failure instead of exiting silently
            // (the old code swallowed the exception entirely).
            e.printStackTrace();
            System.exit(1);
        }
    }
}</lang>
- Output:
$ javac WiktoWords.java $ wget --quiet https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 -O - | bzcat | \ java WiktoWords gratis gratuit livre chien pond pies pie A connotation minute ...
Julia
Uses Regex and a state variable instead of XML parsing. Default setting prints the first 80 French words found. <lang julia>using CodecBzip2
"""
    getwords(io, output; languagemark="==French==", maxwords=80)

Scan the XML dump on `io` line by line with a small state machine (no XML
parser): remember each page's `<title>`, note when a `<text>` element has
just opened, and when `languagemark` appears right after it, write the
title to `output`. Stops after `maxwords` words.
"""
function getwords(io::IO, output::IO; languagemark = "==French==", maxwords = 80)
    titletag, textopen, textclose = "<title>", "<text", "</text>"
    sawtext = false            # did we just pass a <text ...> opener?
    nfound = 0
    pagetitle = ""
    for line in eachline(io)
        if occursin(titletag, line)
            sawtext = false
            m = match(r"<title>([^<]+)</title>", line)
            pagetitle = m === nothing ? "" : m[1]
        elseif occursin(textopen, line)
            sawtext = true
        elseif occursin(languagemark, line)
            if sawtext && pagetitle != ""
                println(output, pagetitle)
                (nfound += 1) >= maxwords && break
            end
            sawtext = false
        elseif occursin(textclose, line)
            sawtext = false
        end
    end
end
const url = "https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2"
const urlfile = "wikidump.bz2"

# Fetch the dump only when there is no non-empty local copy already.
stat(urlfile).size == 0 && download(url, urlfile)

# Decompress on the fly while scanning for words.
const stream = Bzip2DecompressorStream(open(urlfile))
getwords(stream, stdout) # or open a file to write to and use its IO handle instead of stdout
</lang>
- Output:
gratis gratuit livre chien pond pies pie A connotation minute trade adjective adjectival substantive patronage merchandise eagle fa fable a- abaca abada abalone abandon abattoir abaxial abbatial abdication abdicative abdomen abdominal abdominales abduction aberrance aberrant aberration abhorrent abhorrer abime abject abjection abjuration abjure abjurer ablactation ablation ablative able abluent ablution abolition abominable abomination abord abortive about abracadabra abrase abrasion abrasive abraxas abreuvoir abrogation abrogative abrupt on abscission abscond absconder quiz nu lente été servant robot y absent absenter absolution absorbable
OCaml
Using the library xmlm:
<lang ocaml>(* Stream the dump from stdin through xmlm, tracking the open-tag stack;
   remember each <title>'s text and print it whenever the following
   <text> node contains "==French==". *)
let () =
  let input = Xmlm.make_input ~strip:true (`Channel stdin) in
  let title = ref "" in
  let tag_path = ref [] in
  let push tag = tag_path := tag :: !tag_path in
  let pop () =
    match !tag_path with
    | [] -> ()
    | _ :: rest -> tag_path := rest
  in
  let in_tag name =
    match !tag_path with
    | [] -> false
    | current :: _ -> current = name
  in
  let french_re = Str.regexp_string "==French==" in
  let mentions_french s =
    try ignore (Str.search_forward french_re s 0); true
    with Not_found -> false
  in
  while not (Xmlm.eoi input) do
    match Xmlm.input input with
    | `Dtd _ -> ()
    | `El_start ((_, tag_name), _) -> push tag_name
    | `El_end -> pop ()
    | `Data s ->
      if in_tag "title" then title := s;
      if in_tag "text" && mentions_french s then print_endline !title
  done</lang>
- Output:
wget --quiet https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 -O - | bzcat | \ ocaml str.cma -I $(ocamlfind query xmlm) xmlm.cma to_words.ml gratis gratuit livre chien pond pies pie A connotation minute ...
Perl
<lang perl># 20211214 Perl programming solution
use strict;
use warnings;
use LWP::UserAgent;
use Compress::Raw::Bzip2;

my $LanguageMark = "==French==";    # language section heading to search for
my $Target       = 5;               # words
my $URL = 'https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2';

my %needles;                        # titles found so far
my $plain = my $tail = '';          # decompressed text / unfinished last line
my $ua = LWP::UserAgent->new;
# NOTE(review): hashref options follow the original; confirm against the
# installed Compress::Raw::Bzip2 constructor signature.
my $bz = Compress::Raw::Bunzip2->new( { -Bufsize => 1, -AppendOutput => 0 } );

my $res = $ua->request(
    HTTP::Request->new( GET => $URL ),
    sub {    # @_ = (data chunk, HTTP::Response)
        my ($chunk) = @_;
        # Decompress the whole chunk in one call; with -AppendOutput => 0
        # $plain is overwritten on each callback, which is what the
        # tail-carrying logic below expects.
        my $status = $bz->bzinflate( $chunk, $plain );
        return if $status != BZ_OK and $status != BZ_STREAM_END;

        if ( my @haystacks = split "\n", $plain ) {
            # Rejoin the line that was split across chunk boundaries,
            # and hold back the (possibly unfinished) last line.
            $haystacks[0] = $tail . $haystacks[0];
            $tail = $haystacks[-1];
            my ( $title, $got_text_last ) = ( '', 0 );
            foreach ( @haystacks[ 0 .. $#haystacks - 1 ] ) {
                if (/<title>(\w+?)<\/title>/) {
                    ( $title, $got_text_last ) = ( $1, 0 );
                }
                elsif (/<text/) {
                    $got_text_last = 1;
                }
                elsif (/$LanguageMark/) {
                    $needles{$title}++ if $got_text_last and length $title;
                    if ( keys %needles >= $Target ) {
                        print "$_\n" for sort keys %needles;
                        exit;
                    }
                    $got_text_last = 0;
                }
                elsif (/<\/text>/) {
                    $got_text_last = 0;
                }
            }
        }
    }
);</lang>
- Output:
chien gratis gratuit livre pond
Phix
Does not rely on wget/bzcat etc. Downloads in 16K or so blocks, unpacks one block at a time in memory, terminates properly when 5 or more words are found.
Tested on Windows, should be fine on Linux as long as you can provide a suitable bz2.so
constant url = "https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2"
include builtins/libcurl.e
include builtins/bzstream.e

bool got_text_last = false
integer wordcount = 0
string titleword = ""

-- Line-oriented state machine: remember titles, flag <text> openers,
-- and print the title when ==French== follows one. Returns false to
-- stop the download once enough words have been printed.
function doline(object line)
    if not string(line) then -- (opt: close output file)
        return false
    end if
    integer k = match("<title>", line)
    if k then
        got_text_last = false
        k += length("<title>")
        integer l = match("</title>", line, k)
        titleword = iff(l?line[k..l-1]:"")
    elsif match("<text", line) then
        got_text_last = true
    elsif match("==French==", line) then
        if got_text_last and titleword != "" then
            printf(1,"%s\n", titleword)
            wordcount += 1
            if wordcount >= 5 then -- (opt: close output file)
                return false
            end if
        end if
        got_text_last = false
    elsif match("</text>", line) then
        got_text_last = false
    end if
    return true
end function

atom tbr = 0 -- Total Bytes Written
string demiline = "" -- partial line carried between blocks
constant BLOCKSIZE = 8192
atom outbuf = allocate(BLOCKSIZE)

-- libcurl write callback: feed each downloaded chunk through the bz2
-- decompressor and hand complete lines to doline().
function write_callback(atom pData, integer size, integer nmemb, atom pUserdata)
    integer bytes_written = size*nmemb
    tbr += bytes_written
    set_struct_field(id_bzs,p_bzs,"next_in",pData)
    set_struct_field(id_bzs,p_bzs,"avail_in",bytes_written)
    set_struct_field(id_bzs,p_bzs,"next_out",outbuf)
    set_struct_field(id_bzs,p_bzs,"avail_out",BLOCKSIZE)
    while true do
        integer res = BZ2_bzDecompress(),
                avail_in = get_struct_field(id_bzs,p_bzs,"avail_in"),
                avail_out = get_struct_field(id_bzs,p_bzs,"avail_out")
        if avail_out<BLOCKSIZE then
            string block = demiline & peek({outbuf,BLOCKSIZE-avail_out})
            integer linestart = 1
            for i=1 to length(block) do
                if block[i]='\n' then
                    if not doline(block[linestart..i-1]) then
                        BZ2_bzDecompressEnd()
                        return 0 -- terminate download
                    end if
                    linestart = i+1
                end if
            end for
            demiline = block[linestart..$]
            set_struct_field(id_bzs,p_bzs,"next_out",outbuf)
            set_struct_field(id_bzs,p_bzs,"avail_out",BLOCKSIZE)
        end if
        if res=BZ_STREAM_END then
            BZ2_bzDecompressEnd()
            return 0
        end if
        if res!=BZ_OK then ?9/0 end if
        if avail_in=0 then exit end if
    end while
    return bytes_written
end function

constant write_cb = call_back({'+',routine_id("write_callback")})
atom curl = curl_easy_init()
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb)
curl_easy_setopt(curl, CURLOPT_URL, url)
BZ2_bzDecompressInit()
integer res = curl_easy_perform(curl)
curl_easy_cleanup(curl)
printf(1,"Total downloaded: %s\n",{file_size_k(tbr)})
- Output:
gratis gratuit livre chien pond Total downloaded: 239.67KB
Raku
I misunderstood the data format, so the processing logic below is copied verbatim from the Julia entry. <lang perl6># 20211209 Raku programming solution
use LWP::Simple; use Compress::Bzip2; use IO::Socket::SSL;
my $LanguageMark = "==French=="; my $Target = 5; # words my $URL = 'https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2';
class CustomLWP is LWP::Simple {
    has $.URL;

    # Streams the bz2 dump over TLS, decompressing on the fly, and returns
    # the first $Target page titles whose wikitext contains $LanguageMark.
    # (Bug fix: the garbled initializers `= ;` and `= , False ;` from the
    # wiki extraction are restored to real empty-string defaults.)
    method CustomRequest {
        my Blob $resp = Buf.new;
        my $bzip = Compress::Bzip2::Stream.new;
        my $tail = '';     # unfinished trailing line carried between chunks
        my %needles;       # titles found so far

        my ($host, $port, $path) = self.parse_url($.URL)[1..3];
        my $sock = IO::Socket::SSL.new: :$host, :$port;

        # Hand-rolled HTTP request so the body can be read incrementally.
        $sock.print(
            "GET {$path} HTTP/1.1\r\n"
            ~ self.stringify_headers( {
                'Connection' => 'close',
                'User-Agent' => "LWP::Simple/{LWP::Simple::<$VERSION>} "
                                ~ "Raku/{$*RAKU.compiler.gist}",
                'Host'       => $host
            } )
            ~ "\r\n"
        ) or die ; # request string

        while !self.got-header($resp) { $resp ~= $sock.read(2048) }

        my $bzip-stream = supply {
            emit self.parse_response($resp)[2]; # $resp_content @ parent class
            loop {
                done if %needles.elems >= $Target;
                ( my $chunk = $sock.read(4096) ) ?? emit $chunk !! done
            }
        }

        react {
            whenever $bzip-stream -> $crypt {
                my $plain = ( [~] $bzip.decompress: $crypt ).decode('utf8-c8');
                my @haystacks = $plain.split: "\n";
                @haystacks[0] = $tail ~ @haystacks[0];
                $tail = @haystacks[*-1];

                my ($title, $got_text_last) = '', False;

                for @haystacks[0..*-2] {
                    if / '<title>' (\w+?) '</title>' / {
                        ($title, $got_text_last) = $0, False;
                    }
                    elsif / '<text' / { $got_text_last = True }
                    elsif / $LanguageMark / {
                        %needles{$title}++ if ( $got_text_last and $title.Bool );
                        last if ( %needles.elems >= $Target );
                        $got_text_last = False;
                    }
                    elsif / '</text>' / { $got_text_last = False }
                }
            }
        }
        return %needles.keys[^$Target]
    }
}
# Run the custom request and print each collected word.
my $ua = CustomLWP.new: URL => $URL;
.say for $ua.CustomRequest;</lang>
- Output:
chien gratuit gratis pond livre