WiktionaryDumps to words: Difference between revisions

← Older edit

WiktionaryDumps to words (view source)

Revision as of 09:30, 17 February 2024

13,367 bytes added , 3 months ago

m

→‎{{header|Wren}}: Minor tidy and rerun

PureFox

9,476

edits

Revision as of 23:56, 13 April 2021 (view source) Petelomax (talk \| contribs) m (→‎{{header\|Phix}}: win/lnx note) ← Older edit		Latest revision as of 09:30, 17 February 2024 (view source) PureFox (talk \| contribs) m (→‎{{header\|Wren}}: Minor tidy and rerun)
(12 intermediate revisions by 5 users not shown)
Line 1: {{draft task}} ;NOTE: Please help addressing the issues about this task on the discussion page. If you add another language, be aware that this task may change in the future, and that you will need to update your example. ;Task: Make a file that can be useful with [https://en.wikipedia.org/wiki/Spell_checker spell checkers] like [https://fr.wikipedia.org/wiki/Ispell Ispell] and [https://en.wikipedia.org/wiki/GNU_Aspell Aspell]. Use the [https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 wiktionary dump] (input) to create a file equivalent ~~than~~to [https://manpages.ubuntu.com/manpages/bionic/man5/spanish.5.html "/usr/share/dict/spanish"] (output). The input file is an XML dump of the Wiktionary that is a bz2'ed file of about 800MB. The output file should be a file similar ~~than~~to "/usr/share/dict/spanish", ~~which~~a ~~contains~~simple ~~one~~text ~~word~~file ofeach aline ~~given~~of ~~language~~which byis ~~line~~one word in athe ~~simple~~given ~~text file~~language. An example of such a file is available in Ubuntu with the package '''wspanish'''. =={{header\|C}}== <~~lang~~syntaxhighlight Clang="c">#include <stdio.h> #include <stdlib.h> #include <stdbool.h> Line 207 ⟶ 205: return 0; }</~~lang~~syntaxhighlight> {{out}} Line 231 ⟶ 229: =={{header\|Java}}== <~~lang~~syntaxhighlight lang="java">import org.xml.sax.; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.SAXException; Line 290 ⟶ 288: } } }</~~lang~~syntaxhighlight> {{out}} Line 313 ⟶ 311: =={{header\|Julia}}== Uses Regex and a state variable instead of XML parsing. Default setting prints the first 80 French words found. <~~lang~~syntaxhighlight lang="julia">using CodecBzip2 function getwords(io::IO, output::IO; languagemark = "==French==", maxwords = 80) Line 343 ⟶ 341: getwords(stream, stdout) # or open a file to write to and use its IO handle instead of stdout </~~lang~~syntaxhighlight>{{out}} <pre> gratis Line 425 ⟶ 423: absolution absorbable </pre> =={{header\|OCaml}}== Using the library [http://erratique.ch/software/xmlm xmlm]: <syntaxhighlight lang="ocaml">let () = let i = Xmlm.make_input ~strip:true (`Channel stdin) in let title = ref "" in let tag_path = ref [] in let push_tag tag = tag_path := tag :: !tag_path in let pop_tag () = match !tag_path with [] -> () \| _ :: tl -> tag_path := tl in let last_tag_is tag = match !tag_path with [] -> false \| hd :: _ -> hd = tag in let reg = Str.regexp_string "==French==" in let matches s = try let _ = Str.search_forward reg s 0 in true with Not_found -> false in while not (Xmlm.eoi i) do match Xmlm.input i with \| `Dtd dtd -> () \| `El_start ((uri, tag_name), attrs) -> push_tag tag_name \| `El_end -> pop_tag () \| `Data s -> if last_tag_is "title" then title := s; if last_tag_is "text" then begin if matches s then print_endline !title end done</syntaxhighlight> {{out}} <pre> wget --quiet https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 -O - \| bzcat \| \ ocaml str.cma -I $(ocamlfind query xmlm) xmlm.cma to_words.ml gratis gratuit livre chien pond pies pie A connotation minute ... </pre> =={{header\|Perl}}== {{trans\|Raku}} <syntaxhighlight lang="perl"># 20211214 Perl programming solution use strict; use warnings; use LWP::UserAgent; use Compress::Raw::Bzip2 ; my $LanguageMark = "==French=="; my $Target = 5; # words my $URL = 'https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2'; my %needles; my $plain = my $tail = ''; my $ua = LWP::UserAgent->new; my $bz = new Compress::Raw::Bunzip2({ -Bufsize => 1, -AppendOutput => 0 }); my $res = $ua->request( HTTP::Request->new(GET => $URL), sub { # @_ = Data Chunk, HTTP::Response foreach (split '', $_[0]) { my $status = $bz->bzinflate($_, substr($plain, 0)) ; last if $status == BZ_STREAM_END or $status != BZ_OK ; } if ( scalar ( my @haystacks = split "\n", $plain)) { $haystacks[0] = $tail . $haystacks[0]; $tail = $haystacks[$#haystacks]; my ($title,$got_text_last) = '', 0 ; foreach ( @haystacks[0..$#haystacks-1] ) { if ( /<title>(\w+?)<\/title>/ ) { ($title,$got_text_last) = $1, 0; } elsif ( /<text/ ) { $got_text_last = 1; } elsif ( /$LanguageMark/ ) { $needles{$title}++ if ( $got_text_last and $title.defined ); if ( %needles >= $Target ) { print "$_\n" for sort keys %needles; exit; } $got_text_last = 0; } elsif ( /<\/text>/ ) { $got_text_last = 0 } } } } )</syntaxhighlight> {{out}} <pre> chien gratis gratuit livre pond </pre> =={{header\|Phix}}== Does not ~~reply~~rely on wget/bzcat etc. Downloads in 16K or so blocks, unpacks one block at a time in memory, terminates properly when 5 or more words are found.<br> Tested on Windows, should be fine on Linux as long as you can provide a suitable bz2.so <!--<~~lang~~syntaxhighlight ~~Phix~~lang="phix">(notonline)--> <span style="color: #008080;">constant</span> <span style="color: #000000;">url</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">"https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2"</span> Line 486 ⟶ 598: <span style="color: #000000;">avail_out</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">get_struct_field</span><span style="color: #0000FF;">(</span><span style="color: #000000;">id_bzs</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p_bzs</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"avail_out"</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">if</span> <span style="color: #000000;">avail_out</span><span style="color: #0000FF;"><</span><span style="color: #000000;">BLOCKSIZE</span> <span style="color: #008080;">then</span> <span style="color: #004080;">string</span> <span style="color: #~~008080~~000000;">block</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">demiline</span> <span style="color: #0000FF;">&</span> <span style="color: #7060A8;">peek</span><span style="color: #0000FF;">({</span><span style="color: #000000;">outbuf</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BLOCKSIZE</span><span style="color: #0000FF;">-</span><span style="color: #000000;">avail_out</span><span style="color: #0000FF;">})</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">linestart</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span> <span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #~~008080~~000000;">block</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span> <span style="color: #008080;">if</span> <span style="color: #~~008080~~000000;">block</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]=</span><span style="color: #008000;">'\n'</span> <span style="color: #008080;">then</span> <span style="color: #008080;">if</span> <span style="color: #008080;">not</span> <span style="color: #000000;">doline</span><span style="color: #0000FF;">(</span><span style="color: #~~008080~~000000;">block</span><span style="color: #0000FF;">[</span><span style="color: #000000;">linestart</span><span style="color: #0000FF;">..</span><span style="color: #000000;">i</span><span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">then</span> <span style="color: #000000;">BZ2_bzDecompressEnd</span><span style="color: #0000FF;">()</span> <span style="color: #008080;">return</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- terminate download</span> Line 497 ⟶ 609: <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> <span style="color: #008080;">end</span> <span style="color: #008080;">for</span> <span style="color: #000000;">demiline</span> <span style="color: #0000FF;">=</span> <span style="color: #~~008080~~000000;">block</span><span style="color: #0000FF;">[</span><span style="color: #000000;">linestart</span><span style="color: #0000FF;">..$]</span> <span style="color: #000000;">set_struct_field</span><span style="color: #0000FF;">(</span><span style="color: #000000;">id_bzs</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p_bzs</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"next_out"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">outbuf</span><span style="color: #0000FF;">)</span> <span style="color: #000000;">set_struct_field</span><span style="color: #0000FF;">(</span><span style="color: #000000;">id_bzs</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p_bzs</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"avail_out"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BLOCKSIZE</span><span style="color: #0000FF;">)</span> Line 519 ⟶ 631: <span style="color: #7060A8;">curl_easy_cleanup</span><span style="color: #0000FF;">(</span><span style="color: #000000;">curl</span><span style="color: #0000FF;">)</span> <span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"Total downloaded: %s\n"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">file_size_k</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tbr</span><span style="color: #0000FF;">)})</span> <!--</~~lang~~syntaxhighlight>--> {{out}} <pre> Line 530 ⟶ 642: </pre> =={{header\|~~OCaml~~Raku}}== I misunderstood the data format and now just copy verbatim from Julia entry the processing logics .. <syntaxhighlight lang="raku" line># 20211209 Raku programming solution use LWP::Simple; ~~Using the library [http://erratique.ch/software/xmlm xmlm]:~~ use Compress::Bzip2; use IO::Socket::SSL; my $LanguageMark = "==French=="; ~~<lang ocaml>let () =~~ my $Target = 5; # words ~~let i = Xmlm.make_input ~strip:true (`Channel stdin) in~~ my $URL = 'https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2'; ~~let title = ref "" in~~ ~~let tag_path = ref [] in~~ ~~let push_tag tag =~~ ~~tag_path := tag :: !tag_path~~ in ~~let pop_tag () =~~ ~~match !tag_path with [] -> ()~~ ~~\| _ :: tl -> tag_path := tl~~ in ~~let last_tag_is tag =~~ ~~match !tag_path with [] -> false~~ ~~\| hd :: _ -> hd = tag~~ in ~~let reg = Str.regexp_string "==French==" in~~ ~~let matches s =~~ ~~try let _ = Str.search_forward reg s 0 in true~~ ~~with Not_found -> false~~ in ~~while not (Xmlm.eoi i) do~~ ~~match Xmlm.input i with~~ ~~\| `Dtd dtd -> ()~~ ~~\| `El_start ((uri, tag_name), attrs) -> push_tag tag_name~~ ~~\| `El_end -> pop_tag ()~~ ~~\| `Data s ->~~ ~~if last_tag_is "title"~~ ~~then title := s;~~ ~~if last_tag_is "text"~~ ~~then begin~~ ~~if matches s~~ ~~then print_endline !title~~ ~~end~~ ~~done</lang>~~ class CustomLWP is LWP::Simple { has $.URL ; method CustomRequest { my Blob $resp = Buf.new; my $bzip = Compress::Bzip2::Stream.new; my ( $tail, %needles ) = ''; my ($host, $port, $path) = self.parse_url($.URL)[1..3]; my $sock = IO::Socket::SSL.new: :$host, :$port; $sock.print( "GET {$path} HTTP/1.1\r\n" ~ self.stringify_headers( { 'Connection' => 'close', 'User-Agent' => "LWP::Simple/{LWP::Simple::<$VERSION>} " ~ "Raku/{$RAKU.compiler.gist}", 'Host' => $host } ) ~ "\r\n" ) or die ; # request string while !self.got-header($resp) { $resp ~= $sock.read(2048) } my $bzip-stream = supply { emit self.parse_response($resp)[2]; # $resp_content @ parent class loop { done if %needles.elems >= $Target ; ( my $chunk = $sock.read(4096) ) ?? emit $chunk !! done } } react { whenever $bzip-stream -> $crypt { my $plain = ( [~] $bzip.decompress: $crypt ).decode('utf8-c8'); my @haystacks = $plain.split: "\n"; @haystacks[0] = $tail ~ @haystacks[0]; $tail = @haystacks[-1]; my ($title,$got_text_last) = '', False ; for @haystacks[0..-2] { if / '<title>' (\w+?) '</title>' / { ($title,$got_text_last) = $0, False; } elsif / '<text' / { $got_text_last = True } elsif / $LanguageMark / { %needles{$title}++ if ( $got_text_last and $title.Bool ); last if ( %needles.elems >= $Target ) ; $got_text_last = False; } elsif / '</text>' / { $got_text_last = False } } } } return %needles.keys[^$Target] } } my $ua = CustomLWP.new: URL => $URL ; $ua.CustomRequest>>.say</syntaxhighlight> {{out}} <pre> chien gratuit gratis pond livre </pre> =={{header\|Wren}}== {{trans\|Julia}} {{libheader\|libcurl}} {{libheader\|libbzip2}} {{libheader\|Wren-pattern}} An embedded program so we can use libcurl and libbzip2. Rather than downloading the full 800MB .bz2 file and then decompressing it, we abort the download after receiving no more than the first 512 KB and then decompress that ignoring the resultant BZ_UNEXPECTED_EOF error. This turns out to be enough to find the first 22 French words. <syntaxhighlight lang="wren">/* WiktionaryDumps_to_words.wren / import "./pattern" for Pattern var CURLOPT_URL = 10002 var CURLOPT_FOLLOWLOCATION = 52 var CURLOPT_WRITEFUNCTION = 20011 var CURLOPT_WRITEDATA = 10001 foreign class Buffer { construct new() {} // C will allocate buffer of a suitable size foreign value // returns buffer contents as a string after decompression } foreign class Curl { construct easyInit() {} foreign easySetOpt(opt, param) foreign easyPerform() foreign easyCleanup() } var curl = Curl.easyInit() var getContent = Fn.new { \|url\| var buffer = Buffer.new() curl.easySetOpt(CURLOPT_URL, url) curl.easySetOpt(CURLOPT_FOLLOWLOCATION, 1) curl.easySetOpt(CURLOPT_WRITEFUNCTION, 0) // write function to be supplied by C curl.easySetOpt(CURLOPT_WRITEDATA, buffer) curl.easyPerform() return buffer.value } var url = "https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2" var content = getContent.call(url) curl.easyCleanup() var lines = content.split("\n") var title = "<title>" var txtOpen = "<text" var txtClose = "</text>" var langMark = "==French==" var gotTextLast = false var titleWord = "" var p = Pattern.new("<title>[+1^<]<//title>") for (line in lines) { if (line.indexOf(title) >= 0) { gotTextLast = false var m = p.find(line) titleWord = m ? m.capsText[0] : "" } else if (line.indexOf(txtOpen) >= 0) { gotTextLast = true } else if (line.indexOf(langMark) >= 0) { if (gotTextLast && titleWord != "") System.print(titleWord) gotTextLast = false } else if (line.indexOf(txtClose) >= 0) { gotTextLast = false } }</syntaxhighlight> <br> We now embed this script in the following C program, build and run. <syntaxhighlight lang="c">/ gcc WiktionaryDumps_to_words.c -o WiktionaryDumps_to_words -lcurl -lbz2 -lwren -lm / #include <stdio.h> #include <stdlib.h> #include <string.h> #include <curl/curl.h> #include <bzlib.h> #include "wren.h" struct MemoryStruct { char memory; size_t size; }; const size_t LIMIT = 512 * 1024; /* C <=> Wren interface functions / static size_t WriteMemoryCallback(void contents, size_t size, size_t nmemb, void userp) { size_t realsize = size nmemb; struct MemoryStruct mem = (struct MemoryStruct )userp; size_t size_needed = mem->size + realsize + 1; if (size_needed > LIMIT) return -1; // abort download char ptr = realloc(mem->memory, size_needed); if(!ptr) { / out of memory! / printf("not enough memory (realloc returned NULL)\n"); return 0; } mem->memory = ptr; memcpy(&(mem->memory[mem->size]), contents, realsize); mem->size += realsize; mem->memory[mem->size] = 0; return realsize; } void C_bufferAllocate(WrenVM vm) { struct MemoryStruct ms = (struct MemoryStruct )wrenSetSlotNewForeign(vm, 0, 0, sizeof(struct MemoryStruct)); ms->memory = malloc(1); ms->size = 0; } void C_bufferFinalize(void* data) { struct MemoryStruct ms = (struct MemoryStruct )data; free(ms->memory); } void C_curlAllocate(WrenVM* vm) { CURL pcurl = (CURL)wrenSetSlotNewForeign(vm, 0, 0, sizeof(CURL)); pcurl = curl_easy_init(); } void C_value(WrenVM* vm) { struct MemoryStruct ms = (struct MemoryStruct )wrenGetSlotForeign(vm, 0); /* decompress string before returning to Wren / unsigned int destLen = ms->size 5; // should be more than enough char dest = malloc(destLen); int ret = BZ2_bzBuffToBuffDecompress(dest, &destLen, ms->memory, ms->size, 0, 0); / should get a 'compressed data ends unexpectedly' error here which we ignore but report any other error / if (ret != BZ_UNEXPECTED_EOF && ret != BZ_OK) printf("error number %d occurred", ret); char ptr = realloc(ms->memory, destLen); if(!ptr) { /* out of memory! / printf("not enough memory (realloc returned NULL)\n"); return; } ms->memory = ptr; memcpy(ms->memory, dest, destLen); ms->size = destLen; wrenSetSlotString(vm, 0, ms->memory); free(dest); } void C_easyPerform(WrenVM vm) { CURL* curl = (CURL)wrenGetSlotForeign(vm, 0); curl_easy_perform(curl); } void C_easyCleanup(WrenVM vm) { CURL* curl = (CURL)wrenGetSlotForeign(vm, 0); curl_easy_cleanup(curl); } void C_easySetOpt(WrenVM vm) { CURL* curl = (CURL)wrenGetSlotForeign(vm, 0); CURLoption opt = (CURLoption)wrenGetSlotDouble(vm, 1); if (opt < 10000) { long lparam = (long)wrenGetSlotDouble(vm, 2); curl_easy_setopt(curl, opt, lparam); } else if (opt < 20000) { if (opt == CURLOPT_WRITEDATA) { struct MemoryStruct ms = (struct MemoryStruct )wrenGetSlotForeign(vm, 2); curl_easy_setopt(curl, opt, (void )ms); } else if (opt == CURLOPT_URL) { const char url = wrenGetSlotString(vm, 2); curl_easy_setopt(curl, opt, url); } } else if (opt < 30000) { if (opt == CURLOPT_WRITEFUNCTION) { curl_easy_setopt(curl, opt, &WriteMemoryCallback); } } } WrenForeignClassMethods bindForeignClass(WrenVM vm, const char* module, const char* className) { WrenForeignClassMethods methods; methods.allocate = NULL; methods.finalize = NULL; if (strcmp(module, "main") == 0) { if (strcmp(className, "Buffer") == 0) { methods.allocate = C_bufferAllocate; methods.finalize = C_bufferFinalize; } else if (strcmp(className, "Curl") == 0) { methods.allocate = C_curlAllocate; } } return methods; } WrenForeignMethodFn bindForeignMethod( WrenVM* vm, const char* module, const char* className, bool isStatic, const char* signature) { if (strcmp(module, "main") == 0) { if (strcmp(className, "Buffer") == 0) { if (!isStatic && strcmp(signature, "value") == 0) return C_value; } else if (strcmp(className, "Curl") == 0) { if (!isStatic && strcmp(signature, "easySetOpt(_,_)") == 0) return C_easySetOpt; if (!isStatic && strcmp(signature, "easyPerform()") == 0) return C_easyPerform; if (!isStatic && strcmp(signature, "easyCleanup()") == 0) return C_easyCleanup; } } return NULL; } static void writeFn(WrenVM* vm, const char* text) { printf("%s", text); } void errorFn(WrenVM* vm, WrenErrorType errorType, const char* module, const int line, const char* msg) { switch (errorType) { case WREN_ERROR_COMPILE: printf("[%s line %d] [Error] %s\n", module, line, msg); break; case WREN_ERROR_STACK_TRACE: printf("[%s line %d] in %s\n", module, line, msg); break; case WREN_ERROR_RUNTIME: printf("[Runtime Error] %s\n", msg); break; } } char readFile(const char fileName) { FILE f = fopen(fileName, "r"); fseek(f, 0, SEEK_END); long fsize = ftell(f); rewind(f); char script = malloc(fsize + 1); fread(script, 1, fsize, f); fclose(f); script[fsize] = 0; return script; } static void loadModuleComplete(WrenVM* vm, const char* module, WrenLoadModuleResult result) { if( result.source) free((void)result.source); } WrenLoadModuleResult loadModule(WrenVM vm, const char* name) { WrenLoadModuleResult result = {0}; if (strcmp(name, "random") != 0 && strcmp(name, "meta") != 0) { result.onComplete = loadModuleComplete; char fullName[strlen(name) + 6]; strcpy(fullName, name); strcat(fullName, ".wren"); result.source = readFile(fullName); } return result; } int main(int argc, char *argv) { WrenConfiguration config; wrenInitConfiguration(&config); config.writeFn = &writeFn; config.errorFn = &errorFn; config.bindForeignClassFn = &bindForeignClass; config.bindForeignMethodFn = &bindForeignMethod; config.loadModuleFn = &loadModule; WrenVM vm = wrenNewVM(&config); const char* module = "main"; const char* fileName = "WiktionaryDumps_to_words.wren"; char *script = readFile(fileName); WrenInterpretResult result = wrenInterpret(vm, module, script); switch (result) { case WREN_RESULT_COMPILE_ERROR: printf("Compile Error!\n"); break; case WREN_RESULT_RUNTIME_ERROR: printf("Runtime Error!\n"); break; case WREN_RESULT_SUCCESS: break; } wrenFreeVM(vm); free(script); return 0; }</syntaxhighlight> {{out}} <pre> ~~wget --quiet https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 -O - \| bzcat \| \~~ ~~ocaml str.cma -I $(ocamlfind query xmlm) xmlm.cma to_words.ml~~ gratis gratuit Line 584 ⟶ 1,018: connotation minute trade ~~...~~ adjective adjectival substantive patronage deal merchandise eagle f fa fable a- </pre>