WiktionaryDumps to words: Difference between revisions

m
→‎{{header|Wren}}: Minor tidy and rerun
(Added OCaml)
m (→‎{{header|Wren}}: Minor tidy and rerun)
 
(34 intermediate revisions by 8 users not shown)
Line 1:
{{draft task}}
Use the [https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 wiktionary dump] to create a file equivalent to [http://manpages.ubuntu.com/manpages/bionic/man5/french.5.html "/usr/share/dict/french"]. Demonstrate how your language can handle this dump, which is a big bz2'ed XML file of about 800MB. The "/usr/share/dict/french" file is a text file containing one word of the French language per line. This file is available in Ubuntu with the package '''wfrench'''.
 
;Task:
Make a file that can be useful with [https://en.wikipedia.org/wiki/Spell_checker spell checkers] like [https://fr.wikipedia.org/wiki/Ispell Ispell] and [https://en.wikipedia.org/wiki/GNU_Aspell Aspell].
 
Use the [https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 wiktionary dump] (input) to create a file equivalent to [https://manpages.ubuntu.com/manpages/bionic/man5/spanish.5.html "/usr/share/dict/spanish"] (output). The input file is an XML dump of the Wiktionary that is a bz2'ed file of about 800MB. The output file should be a file similar to "/usr/share/dict/spanish", a simple text file each line of which is one word in the given language. An example of such a file is available in Ubuntu with the package '''wspanish'''.
 
 
=={{header|C}}==
 
<syntaxhighlight lang="c">#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
 
#include <expat.h>
#include <pcre.h>
 
#ifdef XML_LARGE_SIZE
# define XML_FMT_INT_MOD "ll"
#else
# define XML_FMT_INT_MOD "l"
#endif
 
#ifdef XML_UNICODE_WCHAR_T
# define XML_FMT_STR "ls"
#else
# define XML_FMT_STR "s"
#endif
 
void reset_char_data_buffer();
void process_char_data_buffer();
 
static bool last_tag_is_title;
static bool last_tag_is_text;
 
static pcre *reCompiled;
static pcre_extra *pcreExtra;
 
 
/*
 * Expat start-tag handler.
 *
 * Any character data collected so far is processed and discarded first,
 * then the handler records whether the element being opened is <title>
 * or <text> so that the text that follows can be routed accordingly.
 */
void start_element(void *data, const char *element, const char **attribute) {
    const bool opens_title = (strcmp("title", element) == 0);
    const bool opens_text = (strcmp("text", element) == 0);

    /* Flush whatever was gathered before this tag appeared. */
    process_char_data_buffer();
    reset_char_data_buffer();

    if (opens_title) {
        last_tag_is_title = true;
    }
    if (opens_text) {
        last_tag_is_text = true;
    }
}
 
/*
 * Expat end-tag handler: hands the text gathered for the element that is
 * closing to process_char_data_buffer() and then empties the buffer.
 * Neither argument is inspected.
 */
void end_element(void *data, const char *el) {
    (void)data;
    (void)el;

    process_char_data_buffer();
    reset_char_data_buffer();
}
 
 
/* Capacity, in bytes, of the buffers used while walking the XML stream. */
#define TITLE_BUF_SIZE (1024 * 8)
#define CHAR_DATA_BUF_SIZE (1024 * 64 * 8)

/* Text of the element currently being read, assembled piece by piece. */
static char char_data_buffer[CHAR_DATA_BUF_SIZE];
/* Copy of the most recent <title> text. */
static char title_buffer[TITLE_BUF_SIZE];
/* Number of bytes currently stored in char_data_buffer. */
static size_t offs;
/* Set once a piece of text did not fit into char_data_buffer. */
static bool overflow;


/* Marks the character-data buffer as empty and clears the overflow flag. */
void reset_char_data_buffer(void) {
    overflow = false;
    offs = 0;
}
 
// Expat character-data handler: appends each fragment of the current
// node's text to char_data_buffer. One byte is always kept free for the
// terminator written later. The first fragment that does not fit raises
// the overflow flag, emits a warning, and causes every further fragment
// to be ignored until the buffer is reset.
void char_data(void *userData, const XML_Char *s, int len) {
    if (overflow) {
        return;
    }

    if (len + offs < sizeof(char_data_buffer)) {
        memcpy(char_data_buffer + offs, s, len);
        offs += len;
        return;
    }

    overflow = true;
    fprintf(stderr, "Warning: buffer overflow\n");
    fflush(stderr);
}
 
void try_match();

/*
 * Consumes the text gathered in char_data_buffer for the element flagged
 * by the start-tag handler.
 *
 *  - After <title>: the text is stored in title_buffer. Titles longer than
 *    the buffer are truncated, and the copy is always NUL-terminated so it
 *    can safely be printed later.
 *  - After <text>: the text is NUL-terminated and passed to try_match().
 *
 * Both "last tag" flags are cleared on every call, including when no text
 * was collected, so that an empty <title/> or <text/> cannot cause the
 * content of a later, unrelated element to be treated as a title or as
 * article text. An empty title also clears title_buffer so that a stale
 * title from an earlier page is never reused.
 */
void process_char_data_buffer(void) {
    if (offs > 0) {
        char_data_buffer[offs] = '\0';

        if (last_tag_is_title) {
            size_t n = offs + 1;            /* text plus its terminator */
            if (n > TITLE_BUF_SIZE) {
                n = TITLE_BUF_SIZE;
            }
            memcpy(title_buffer, char_data_buffer, n);
            /* Guarantees termination when the title had to be truncated. */
            title_buffer[TITLE_BUF_SIZE - 1] = '\0';
        }
        if (last_tag_is_text) {
            try_match();
        }
    } else if (last_tag_is_title) {
        title_buffer[0] = '\0';
    }

    last_tag_is_title = false;
    last_tag_is_text = false;
}
 
/*
 * Runs the compiled regular expression over the NUL-terminated text in
 * char_data_buffer. When it matches, the stored title is written to
 * stdout on its own line. A plain "no match" is silent; any other PCRE
 * error code produces a one-line diagnostic on stderr.
 */
void try_match()
{
    int ovector[80];
    const int ovector_len = sizeof(ovector) / sizeof(int);
    const char *diagnostic;

    const int rc = pcre_exec(
        reCompiled, pcreExtra,
        char_data_buffer, strlen(char_data_buffer),
        0, 0,
        ovector, ovector_len);

    if (rc >= 0) {
        puts(title_buffer);  // print the word
        return;
    }
    if (rc == PCRE_ERROR_NOMATCH) {
        return;
    }

    switch (rc) {
    case PCRE_ERROR_NULL:
        diagnostic = "Something was null\n";
        break;
    case PCRE_ERROR_BADOPTION:
        diagnostic = "A bad option was passed\n";
        break;
    case PCRE_ERROR_BADMAGIC:
        diagnostic = "Magic number bad (compiled re corrupt?)\n";
        break;
    case PCRE_ERROR_UNKNOWN_NODE:
        diagnostic = "Something kooky in the compiled re\n";
        break;
    case PCRE_ERROR_NOMEMORY:
        diagnostic = "Ran out of memory\n";
        break;
    default:
        diagnostic = "Unknown error\n";
        break;
    }
    fputs(diagnostic, stderr);
}
 
 
#define BUF_SIZE 1024

/*
 * Reads a decompressed Wiktionary XML dump from stdin and prints, one per
 * line on stdout, the title of every page whose text matches the language
 * regex compiled below.
 *
 * The regex is compiled and studied once with PCRE; the XML is streamed
 * through Expat in BUF_SIZE-byte pieces so the whole dump never has to be
 * held in memory. Exits with status 1 on a regex, read, allocation or
 * parse error, 0 otherwise. Command-line arguments are not used.
 */
int main(int argc, char *argv[])
{
    char buffer[BUF_SIZE];

    const char *pcreErrorStr = NULL;
    int pcreErrorOffset;
    const char *aStrRegex = "(.*)(==French==)(.*)";  // search for French language

    (void)argc;
    (void)argv;

    // Using PCRE

    reCompiled = pcre_compile(aStrRegex, PCRE_DOTALL | PCRE_UTF8, &pcreErrorStr, &pcreErrorOffset, NULL);
    if (reCompiled == NULL) {
        fprintf(stderr, "ERROR: Could not compile regex '%s': %s\n", aStrRegex, pcreErrorStr);
        exit(1);
    }

    // pcre_study() may legitimately return NULL; failure is signalled
    // through the error string instead.
    pcreExtra = pcre_study(reCompiled, 0, &pcreErrorStr);
    if (pcreErrorStr != NULL) {
        fprintf(stderr, "ERROR: Could not study regex '%s': %s\n", aStrRegex, pcreErrorStr);
        exit(1);
    }

    // Using Expat parser

    XML_Parser parser = XML_ParserCreate(NULL);
    if (parser == NULL) {
        fprintf(stderr, "ERROR: Could not allocate XML parser\n");
        exit(1);
    }

    XML_SetElementHandler(parser, start_element, end_element);
    XML_SetCharacterDataHandler(parser, char_data);

    reset_char_data_buffer();

    for (;;) {
        const size_t len = fread(buffer, 1, BUF_SIZE, stdin);
        if (ferror(stdin)) {
            fprintf(stderr, "Read error\n");
            exit(1);
        }
        // Non-zero once the final piece has been read; tells Expat that
        // the document is complete.
        const int done = feof(stdin);

        if (XML_Parse(parser, buffer, (int)len, done) == XML_STATUS_ERROR) {
            fprintf(stderr,
                    "Parse error at line %" XML_FMT_INT_MOD "u:\n%" XML_FMT_STR "\n",
                    XML_GetCurrentLineNumber(parser),
                    XML_ErrorString(XML_GetErrorCode(parser)));
            exit(1);
        }

        if (done) {
            break;
        }
    }

    XML_ParserFree(parser);

    pcre_free(reCompiled);

    if (pcreExtra != NULL) {
#ifdef PCRE_CONFIG_JIT
        pcre_free_study(pcreExtra);
#else
        pcre_free(pcreExtra);
#endif
    }

    return 0;
}</syntaxhighlight>
 
{{out}}
 
<pre>
$ gcc wikt_to_words.c -o wikt_to_words -lpcre -lexpat
$ wget --quiet https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 -O - | bzcat | \
./wikt_to_words
gratis
gratuit
livre
chien
pond
pies
pie
A
connotation
minute
...
</pre>
 
 
=={{header|Java}}==
 
<syntaxhighlight lang="java">import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.SAXException;
 
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.parsers.ParserConfigurationException;
 
import java.util.regex.Pattern;
import java.util.regex.Matcher;
 
class MyHandler extends DefaultHandler {
private static final String TITLE = "title";
private static final String TEXT = "text";
 
private String lastTag = "";
private String title = "";
 
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
String regex = ".*==French==.*";
Pattern pat = Pattern.compile(regex, Pattern.DOTALL);
 
switch (lastTag) {
case TITLE:
title = new String(ch, start, length);
break;
case TEXT:
String text = new String(ch, start, length);
Matcher mat = pat.matcher(text);
if (mat.matches()) {
System.out.println(title);
}
break;
}
}
 
@Override
public void startElement(String uri, String localName, String qName, Attributes attrs) throws SAXException {
lastTag = qName;
}
 
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
lastTag = "";
}
}
 
/**
 * Entry point: parses the XML document arriving on standard input with a
 * SAX parser driven by {@link MyHandler}. Any failure is reported on
 * standard error before the program exits with status 1.
 */
public class WiktoWords {
    public static void main(java.lang.String[] args) {
        try {
            SAXParserFactory spFactory = SAXParserFactory.newInstance();
            SAXParser saxParser = spFactory.newSAXParser();
            MyHandler handler = new MyHandler();
            saxParser.parse(new InputSource(System.in), handler);
        } catch (Exception e) {
            // Say why we are giving up instead of exiting silently.
            System.err.println("WiktoWords: " + e);
            System.exit(1);
        }
    }
}</syntaxhighlight>
 
{{out}}
 
<pre>
$ javac WiktoWords.java
$ wget --quiet https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 -O - | bzcat | \
java WiktoWords
gratis
gratuit
livre
chien
pond
pies
pie
A
connotation
minute
...
</pre>
 
=={{header|Julia}}==
Uses Regex and a state variable instead of XML parsing. Default setting prints the first 80 French words found.
<syntaxhighlight lang="julia">using CodecBzip2
 
"""
    getwords(io, output; languagemark = "==French==", maxwords = 80)

Scan the XML text read line by line from `io` and write to `output`, one
per line, the title of each page whose text contains `languagemark`.
Stops after `maxwords` titles have been written.

A title is taken from a line containing `<title>…</title>`. The language
mark only counts between the line that opens `<text` and the line that
closes `</text>`, and at most once per page. The line that opens `<text`
is itself checked for the mark, because the first line of the wikitext
shares that line with the opening tag.
"""
function getwords(io::IO, output::IO; languagemark = "==French==", maxwords = 80)
    title, txopen, txclose = "<title>", "<text", "</text>"
    got_text_last = false
    wordcount, titleword = 0, ""
    for line in eachline(io)
        if occursin(title, line)
            got_text_last = false
            m = match(r"<title>([^<]+)</title>", line)
            titleword = m !== nothing ? m[1] : ""
            continue
        end
        # Not an `elseif`: the opening line may already carry the mark.
        if occursin(txopen, line)
            got_text_last = true
        end
        if occursin(languagemark, line)
            if got_text_last && titleword != ""
                println(output, titleword)
                (wordcount += 1) >= maxwords && break
            end
            got_text_last = false
        elseif occursin(txclose, line)
            got_text_last = false
        end
    end
end
 
# Address of the compressed dump and the local file it is stored in.
const url = "https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2"
const urlfile = "wikidump.bz2"

# Download only when the local file is reported as having size zero.
if stat(urlfile).size == 0
    download(url, urlfile)
end

# Decompress on the fly and scan the decompressed stream.
const stream = Bzip2DecompressorStream(open(urlfile))
getwords(stream, stdout) # or open a file to write to and use its IO handle instead of stdout
 
</syntaxhighlight>{{out}}
<pre>
gratis
gratuit
livre
chien
pond
pies
pie
A
connotation
minute
trade
adjective
adjectival
substantive
patronage
merchandise
eagle
fa
fable
a-
abaca
abada
abalone
abandon
abattoir
abaxial
abbatial
abdication
abdicative
abdomen
abdominal
abdominales
abduction
aberrance
aberrant
aberration
abhorrent
abhorrer
abime
abject
abjection
abjuration
abjure
abjurer
ablactation
ablation
ablative
able
abluent
ablution
abolition
abominable
abomination
abord
abortive
about
abracadabra
abrase
abrasion
abrasive
abraxas
abreuvoir
abrogation
abrogative
abrupt
on
abscission
abscond
absconder
quiz
nu
lente
été
servant
robot
y
absent
absenter
absolution
absorbable
</pre>
 
 
=={{header|OCaml}}==
 
Using the library [http://erratique.ch/software/xmlm xmlm]:
<lang ocaml>let () =
 
<syntaxhighlight lang="ocaml">let () =
let i = Xmlm.make_input ~strip:true (`Channel stdin) in
let title = ref "" in
Line 17 ⟶ 444:
match !tag_path with [] -> false
| hd :: _ -> hd = tag
in
let reg = Str.regexp_string "==French==" in
let matches s =
try let _ = Str.search_forward reg s 0 in true
with Not_found -> false
in
while not (Xmlm.eoi i) do
Line 28 ⟶ 460:
if last_tag_is "text"
then begin
letif regmatches = Str.regexp_string "==French==" ins
if Str.string_match reg s 0
then print_endline !title
end
done</langsyntaxhighlight>
 
{{out}}
 
<pre>
wget --quiet https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 -O - | bzcat | \
ocaml str.cma -I $(ocamlfind query xmlm) xmlm.cma to_words.ml
gratis
gratuit
livre
chien
pond
pies
pie
A
connotation
minute
...
</pre>
 
=={{header|Perl}}==
{{trans|Raku}}
<syntaxhighlight lang="perl"># 20211214 Perl programming solution

use strict;
use warnings;
use LWP::UserAgent;
use Compress::Raw::Bzip2 ;

my $LanguageMark = "==French==";
my $Target = 5; # words
my $URL = 'https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2';

# %needles collects the titles found (as keys); $plain receives the
# decompressed text; $tail holds the last, possibly incomplete, line of a
# chunk so it can be prepended to the next one.
my %needles; my $plain = my $tail = '';
my $ua = LWP::UserAgent->new;
my $bz = new Compress::Raw::Bunzip2({ -Bufsize => 1, -AppendOutput => 0 });

# The callback is invoked for every chunk of the response body.
my $res = $ua->request( HTTP::Request->new(GET => $URL),
   sub {   # @_ = Data Chunk, HTTP::Response
      # Hand the chunk to the decompressor one character at a time and
      # stop at the end of the stream or on the first error.
      foreach (split '', $_[0]) {
         my $status = $bz->bzinflate($_, substr($plain, 0)) ;
         last if $status == BZ_STREAM_END or $status != BZ_OK ;
      }

      if ( scalar ( my @haystacks = split "\n", $plain)) {
         $haystacks[0] = $tail . $haystacks[0];
         $tail = $haystacks[$#haystacks];
         # Parenthesised right-hand side: without it only the first
         # variable is assigned and the trailing constant is discarded.
         my ($title,$got_text_last) = ('', 0);
         foreach ( @haystacks[0..$#haystacks-1] ) {
            if ( /<title>(\w+?)<\/title>/ ) {
               ($title,$got_text_last) = ($1, 0);
            } elsif ( /<text/ ) {
               $got_text_last = 1;
            } elsif ( /\Q$LanguageMark\E/ ) {
               # Record the title only when one is known; an empty key
               # would otherwise be counted and printed as a word.
               $needles{$title}++ if ( $got_text_last and length $title );
               # keys() in numeric context is the number of entries on
               # every perl version.
               if ( keys(%needles) >= $Target ) {
                  print "$_\n" for sort keys %needles;
                  exit;
               }
               $got_text_last = 0;
            } elsif ( /<\/text>/ ) { $got_text_last = 0 }
         }
      }
   }
)</syntaxhighlight>
{{out}}
<pre>
chien
gratis
gratuit
livre
pond
</pre>
 
=={{header|Phix}}==
Does not rely on wget/bzcat etc. Downloads in 16K or so blocks, unpacks one block at a time in memory, terminates properly when 5 or more words are found.<br>
Tested on Windows, should be fine on Linux as long as you can provide a suitable bz2.so
<!--<syntaxhighlight lang="phix">(notonline)-->
<span style="color: #008080;">constant</span> <span style="color: #000000;">url</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">"https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2"</span>
<span style="color: #008080;">include</span> <span style="color: #000000;">builtins</span><span style="color: #0000FF;">/</span><span style="color: #000000;">libcurl</span><span style="color: #0000FF;">.</span><span style="color: #000000;">e</span>
<span style="color: #008080;">include</span> <span style="color: #000000;">builtins</span><span style="color: #0000FF;">/</span><span style="color: #000000;">bzstream</span><span style="color: #0000FF;">.</span><span style="color: #000000;">e</span>
<span style="color: #004080;">bool</span> <span style="color: #000000;">got_text_last</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">false</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">wordcount</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">titleword</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">doline</span><span style="color: #0000FF;">(</span><span style="color: #004080;">object</span> <span style="color: #000000;">line</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #008080;">not</span> <span style="color: #004080;">string</span><span style="color: #0000FF;">(</span><span style="color: #000000;">line</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
<span style="color: #000080;font-style:italic;">-- (opt: close output file)</span>
<span style="color: #008080;">return</span> <span style="color: #004600;">false</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">k</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">match</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"&lt;title&gt;"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">line</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">k</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">got_text_last</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">false</span>
<span style="color: #000000;">k</span> <span style="color: #0000FF;">+=</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"&lt;title&gt;"</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">l</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">match</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"&lt;/title&gt;"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">line</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">k</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">titleword</span> <span style="color: #0000FF;">=</span> <span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #000000;">l</span><span style="color: #0000FF;">?</span><span style="color: #000000;">line</span><span style="color: #0000FF;">[</span><span style="color: #000000;">k</span><span style="color: #0000FF;">..</span><span style="color: #000000;">l</span><span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]:</span><span style="color: #008000;">""</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">elsif</span> <span style="color: #7060A8;">match</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"&lt;text"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">line</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">got_text_last</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">true</span>
<span style="color: #008080;">elsif</span> <span style="color: #7060A8;">match</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"==French=="</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">line</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">got_text_last</span> <span style="color: #008080;">and</span> <span style="color: #000000;">titleword</span> <span style="color: #0000FF;">!=</span> <span style="color: #008000;">""</span> <span style="color: #008080;">then</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%s\n"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">titleword</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">wordcount</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">wordcount</span> <span style="color: #0000FF;">>=</span> <span style="color: #000000;">5</span> <span style="color: #008080;">then</span>
<span style="color: #000080;font-style:italic;">-- (opt: close output file)</span>
<span style="color: #008080;">return</span> <span style="color: #004600;">false</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">got_text_last</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">false</span>
<span style="color: #008080;">elsif</span> <span style="color: #7060A8;">match</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"&lt;/text&gt;"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">line</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">got_text_last</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">false</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #004600;">true</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #004080;">atom</span> <span style="color: #000000;">tbr</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- Total Bytes Written</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">demiline</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span>
<span style="color: #008080;">constant</span> <span style="color: #000000;">BLOCKSIZE</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">8192</span>
<span style="color: #004080;">atom</span> <span style="color: #000000;">outbuf</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">allocate</span><span style="color: #0000FF;">(</span><span style="color: #000000;">BLOCKSIZE</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">write_callback</span><span style="color: #0000FF;">(</span><span style="color: #004080;">atom</span> <span style="color: #000000;">pData</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">size</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">nmemb</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">atom</span> <span style="color: #000000;">pUserdata</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">bytes_written</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">size</span><span style="color: #0000FF;">*</span><span style="color: #000000;">nmemb</span>
<span style="color: #000000;">tbr</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">bytes_written</span>
<span style="color: #000000;">set_struct_field</span><span style="color: #0000FF;">(</span><span style="color: #000000;">id_bzs</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p_bzs</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"next_in"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">pData</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">set_struct_field</span><span style="color: #0000FF;">(</span><span style="color: #000000;">id_bzs</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p_bzs</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"avail_in"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">bytes_written</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">set_struct_field</span><span style="color: #0000FF;">(</span><span style="color: #000000;">id_bzs</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p_bzs</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"next_out"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">outbuf</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">set_struct_field</span><span style="color: #0000FF;">(</span><span style="color: #000000;">id_bzs</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p_bzs</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"avail_out"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BLOCKSIZE</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">while</span> <span style="color: #004600;">true</span> <span style="color: #008080;">do</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">BZ2_bzDecompress</span><span style="color: #0000FF;">(),</span>
<span style="color: #000000;">avail_in</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">get_struct_field</span><span style="color: #0000FF;">(</span><span style="color: #000000;">id_bzs</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p_bzs</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"avail_in"</span><span style="color: #0000FF;">),</span>
<span style="color: #000000;">avail_out</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">get_struct_field</span><span style="color: #0000FF;">(</span><span style="color: #000000;">id_bzs</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p_bzs</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"avail_out"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">avail_out</span><span style="color: #0000FF;"><</span><span style="color: #000000;">BLOCKSIZE</span> <span style="color: #008080;">then</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">block</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">demiline</span> <span style="color: #0000FF;">&</span> <span style="color: #7060A8;">peek</span><span style="color: #0000FF;">({</span><span style="color: #000000;">outbuf</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BLOCKSIZE</span><span style="color: #0000FF;">-</span><span style="color: #000000;">avail_out</span><span style="color: #0000FF;">})</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">linestart</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span>
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">block</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">block</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]=</span><span style="color: #008000;">'\n'</span> <span style="color: #008080;">then</span>
<span style="color: #008080;">if</span> <span style="color: #008080;">not</span> <span style="color: #000000;">doline</span><span style="color: #0000FF;">(</span><span style="color: #000000;">block</span><span style="color: #0000FF;">[</span><span style="color: #000000;">linestart</span><span style="color: #0000FF;">..</span><span style="color: #000000;">i</span><span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">BZ2_bzDecompressEnd</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- terminate download</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">linestart</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #000000;">demiline</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">block</span><span style="color: #0000FF;">[</span><span style="color: #000000;">linestart</span><span style="color: #0000FF;">..$]</span>
<span style="color: #000000;">set_struct_field</span><span style="color: #0000FF;">(</span><span style="color: #000000;">id_bzs</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p_bzs</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"next_out"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">outbuf</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">set_struct_field</span><span style="color: #0000FF;">(</span><span style="color: #000000;">id_bzs</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p_bzs</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"avail_out"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BLOCKSIZE</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">=</span><span style="color: #000000;">BZ_STREAM_END</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">BZ2_bzDecompressEnd</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">0</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">BZ_OK</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">avail_in</span><span style="color: #0000FF;">=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span> <span style="color: #008080;">exit</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">bytes_written</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">constant</span> <span style="color: #000000;">write_cb</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">call_back</span><span style="color: #0000FF;">({</span><span style="color: #008000;">'+'</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">routine_id</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"write_callback"</span><span style="color: #0000FF;">)})</span>
<span style="color: #004080;">atom</span> <span style="color: #000000;">curl</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">curl_easy_init</span><span style="color: #0000FF;">()</span>
<span style="color: #7060A8;">curl_easy_setopt</span><span style="color: #0000FF;">(</span><span style="color: #000000;">curl</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">CURLOPT_WRITEFUNCTION</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">write_cb</span><span style="color: #0000FF;">)</span>
<span style="color: #7060A8;">curl_easy_setopt</span><span style="color: #0000FF;">(</span><span style="color: #000000;">curl</span><span style="color: #0000FF;">,</span> <span style="color: #004600;">CURLOPT_URL</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">url</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">BZ2_bzDecompressInit</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">curl_easy_perform</span><span style="color: #0000FF;">(</span><span style="color: #000000;">curl</span><span style="color: #0000FF;">)</span>
<span style="color: #7060A8;">curl_easy_cleanup</span><span style="color: #0000FF;">(</span><span style="color: #000000;">curl</span><span style="color: #0000FF;">)</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"Total downloaded: %s\n"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">file_size_k</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tbr</span><span style="color: #0000FF;">)})</span>
<!--</syntaxhighlight>-->
{{out}}
<pre>
gratis
gratuit
livre
chien
pond
Total downloaded: 239.67KB
</pre>
 
=={{header|Raku}}==
I originally misunderstood the data format, so the processing logic is now copied verbatim from the Julia entry.
<syntaxhighlight lang="raku" line># 20211209 Raku programming solution
 
use LWP::Simple;
use Compress::Bzip2;
use IO::Socket::SSL;
 
# Section heading searched for in each line of the decompressed dump.
my $LanguageMark = "==French==";
# Number of distinct titles to collect before the download is stopped.
my $Target = 5; # words
# Location of the bz2-compressed English Wiktionary XML dump.
my $URL = 'https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2';
 
# LWP::Simple subclass that streams the bz2-compressed dump over TLS and
# scans the decompressed text for entries carrying $LanguageMark, so the
# whole file never has to be downloaded.
class CustomLWP is LWP::Simple { has $.URL ;

    # Requests $.URL, decompresses the body chunk by chunk and records the
    # title of every page whose text contains $LanguageMark.  Reading stops
    # once $Target distinct titles are known; the first $Target keys of the
    # collected titles are returned.
    method CustomRequest {

        my Blob $resp = Buf.new;
        my $bzip = Compress::Bzip2::Stream.new;
        my ( $tail, %needles ) = '';

        # Scanner state is kept at method scope so that it carries over from
        # one decompressed chunk to the next: a page's <title> line and its
        # language heading may arrive in different chunks, and resetting the
        # state per chunk would silently drop such a page.
        my ( $title, $got_text_last ) = '', False ;

        my ($host, $port, $path) = self.parse_url($.URL)[1..3];
        my $sock = IO::Socket::SSL.new: :$host, :$port;

        $sock.print( "GET {$path} HTTP/1.1\r\n" ~ self.stringify_headers( {
            'Connection' => 'close',
            'User-Agent' => "LWP::Simple/{LWP::Simple::<$VERSION>} " ~
                            "Raku/{$*RAKU.compiler.gist}",
            'Host'       => $host
        } ) ~ "\r\n" ) or die ; # request string

        # Keep reading until the parent class reports a complete header.
        while !self.got-header($resp) { $resp ~= $sock.read(2048) }

        # Supply of raw (still compressed) body chunks; it finishes when the
        # socket is exhausted or enough words have been collected.
        my $bzip-stream = supply {
            emit self.parse_response($resp)[2]; # $resp_content @ parent class
            loop {
                done if %needles.elems >= $Target ;
                ( my $chunk = $sock.read(4096) ) ?? emit $chunk !! done
            }
        }

        react {
            whenever $bzip-stream -> $crypt {
                my $plain = ( [~] $bzip.decompress: $crypt ).decode('utf8-c8');
                my @haystacks = $plain.split: "\n";
                # The first piece completes the partial line left over from
                # the previous chunk; the last piece is held back as the new
                # partial line.
                @haystacks[0] = $tail ~ @haystacks[0];
                $tail = @haystacks[*-1];

                for @haystacks[0..*-2] {
                    if / '<title>' (\w+?) '</title>' / {
                        ($title,$got_text_last) = $0, False;
                    } elsif / '<text' / {
                        $got_text_last = True
                    } elsif / $LanguageMark / {
                        %needles{$title}++ if ( $got_text_last and $title.Bool );
                        last if ( %needles.elems >= $Target ) ;
                        $got_text_last = False;
                    } elsif / '</text>' / { $got_text_last = False }
                }
            }
        }
        return %needles.keys[^$Target]
    }
}
 
# Build the client for the dump URL, run the request and print each returned word.
my $ua = CustomLWP.new: URL => $URL ;

$ua.CustomRequest>>.say</syntaxhighlight>
{{out}}
<pre>
chien
gratuit
gratis
pond
livre
</pre>
 
=={{header|Wren}}==
{{trans|Julia}}
{{libheader|libcurl}}
{{libheader|libbzip2}}
{{libheader|Wren-pattern}}
An embedded program so we can use libcurl and libbzip2.
 
Rather than downloading the full 800MB .bz2 file and then decompressing it, we abort the download after receiving at most the first 512 KB and then decompress that, ignoring the resulting BZ_UNEXPECTED_EOF error. This turns out to be enough to find the first 22 French words.
<syntaxhighlight lang="wren">/* WiktionaryDumps_to_words.wren */
 
import "./pattern" for Pattern
 
var CURLOPT_URL = 10002
var CURLOPT_FOLLOWLOCATION = 52
var CURLOPT_WRITEFUNCTION = 20011
var CURLOPT_WRITEDATA = 10001
 
// Foreign class whose storage and behaviour are supplied by the host C program.
foreign class Buffer {
construct new() {} // C will allocate buffer of a suitable size

foreign value // returns buffer contents as a string after decompression
}
 
// Foreign wrapper around a libcurl easy handle; every method is implemented
// by the host C program.
foreign class Curl {
// Creates the wrapper; the body is empty because allocation happens on the C side.
construct easyInit() {}

// Sets option 'opt' (one of the CURLOPT_* numbers) to 'param'.
foreign easySetOpt(opt, param)

// Performs the transfer configured on this handle.
foreign easyPerform()

// Cleans up the underlying handle.
foreign easyCleanup()
}
 
var curl = Curl.easyInit()

// Downloads 'url' through the shared curl handle into a fresh Buffer and
// returns the buffer's contents as a string.
var getContent = Fn.new { |url|
    var sink = Buffer.new()
    var options = [
        [CURLOPT_URL, url],
        [CURLOPT_FOLLOWLOCATION, 1],
        [CURLOPT_WRITEFUNCTION, 0], // write function to be supplied by C
        [CURLOPT_WRITEDATA, sink]
    ]
    for (option in options) curl.easySetOpt(option[0], option[1])
    curl.easyPerform()
    return sink.value
}
 
var url = "https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2"
var content = getContent.call(url)
curl.easyCleanup()

// Markers looked for in each line of the decompressed XML.
var title = "<title>"
var txtOpen = "<text"
var txtClose = "</text>"
var langMark = "==French=="
var p = Pattern.new("<title>[+1^<]<//title>")

// Scanner state: the most recent page title, and whether a <text> element
// has been opened since that title without a language mark or </text> yet.
var titleWord = ""
var gotTextLast = false

for (line in content.split("\n")) {
    if (line.contains(title)) {
        var m = p.find(line)
        titleWord = m ? m.capsText[0] : ""
        gotTextLast = false
    } else if (line.contains(txtOpen)) {
        gotTextLast = true
    } else {
        var isMark = line.contains(langMark)
        if (isMark && gotTextLast && titleWord != "") System.print(titleWord)
        if (isMark || line.contains(txtClose)) gotTextLast = false
    }
}</syntaxhighlight>
<br>
We now embed this script in the following C program, then build and run it.
<syntaxhighlight lang="c">/* gcc WiktionaryDumps_to_words.c -o WiktionaryDumps_to_words -lcurl -lbz2 -lwren -lm */
 
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
#include <bzlib.h>
#include "wren.h"
 
/* Growable byte buffer that accumulates the downloaded data. */
struct MemoryStruct {
    char *memory;   /* heap block holding `size` bytes followed by a NUL */
    size_t size;    /* number of payload bytes stored so far */
};

/* Upper bound, in bytes, on the buffer (payload plus its terminator). */
const size_t LIMIT = 512 * 1024;

/* C <=> Wren interface functions */

/*
 * Write callback: appends the size * nmemb bytes at `contents` to the
 * MemoryStruct passed as `userp`, keeping the buffer NUL-terminated.
 *
 * Returns the number of bytes consumed.  Returns (size_t)-1 without storing
 * anything when the chunk would push the buffer past LIMIT, and 0 when the
 * buffer cannot be enlarged; both differ from the chunk size handed in.
 */
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
    struct MemoryStruct *buf = userp;
    const size_t chunk = size * nmemb;
    const size_t wanted = buf->size + chunk + 1;

    if (wanted > LIMIT) {
        return (size_t)-1; // abort download
    }

    char *grown = realloc(buf->memory, wanted);
    if (grown == NULL) {
        /* out of memory! */
        printf("not enough memory (realloc returned NULL)\n");
        return 0;
    }

    memcpy(grown + buf->size, contents, chunk);
    grown[buf->size + chunk] = 0;
    buf->memory = grown;
    buf->size += chunk;
    return chunk;
}
 
/*
 * Allocator for the foreign `Buffer` class: places a MemoryStruct in slot 0
 * and starts it off as a one-byte heap block holding zero payload bytes.
 */
void C_bufferAllocate(WrenVM* vm) {
    void *slot = wrenSetSlotNewForeign(vm, 0, 0, sizeof(struct MemoryStruct));
    struct MemoryStruct *buf = slot;
    buf->size = 0;
    buf->memory = malloc(1);
}
 
/* Finalizer for the foreign `Buffer` class: releases the heap block it owns. */
void C_bufferFinalize(void* data) {
    struct MemoryStruct *buf = data;
    free(buf->memory);
}
 
/*
 * Allocator for the foreign `Curl` class: the foreign object stores a single
 * CURL* obtained from curl_easy_init().
 */
void C_curlAllocate(WrenVM* vm) {
    CURL** handleSlot = wrenSetSlotNewForeign(vm, 0, 0, sizeof(CURL*));
    *handleSlot = curl_easy_init();
}
 
/*
 * Foreign getter `Buffer.value`: decompresses the bzip2 data held by the
 * buffer and returns the resulting text to Wren in slot 0.  Afterwards the
 * buffer owns the decompressed text instead of the compressed bytes.
 *
 * The download is deliberately truncated, so BZ_UNEXPECTED_EOF is expected
 * and ignored; any other bzip2 error is reported on stdout.  If no memory is
 * available for the output, an empty string is returned.
 */
void C_value(WrenVM* vm) {
    struct MemoryStruct *ms = (struct MemoryStruct *)wrenGetSlotForeign(vm, 0);
    /* decompress string before returning to Wren */
    unsigned int destLen = ms->size * 5; // should be more than enough

    /*
     * The output window is zero-filled and followed by one spare byte.
     * BZ2_bzBuffToBuffDecompress only updates destLen on BZ_OK, so after the
     * expected BZ_UNEXPECTED_EOF the amount of valid output is unknown; the
     * zero fill guarantees the text is NUL-terminated right after the last
     * byte that was actually produced.
     */
    char *dest = calloc((size_t)destLen + 1, 1);
    if (!dest) {
        /* out of memory! */
        printf("not enough memory (calloc returned NULL)\n");
        wrenSetSlotString(vm, 0, "");
        return;
    }

    int ret = BZ2_bzBuffToBuffDecompress(dest, &destLen, ms->memory, ms->size, 0, 0);
    /* should get a 'compressed data ends unexpectedly' error here which we ignore
       but report any other error */
    if (ret != BZ_UNEXPECTED_EOF && ret != BZ_OK) printf("error number %d occurred", ret);

    /* Wren copies the bytes, so the buffer can keep ownership of `dest`. */
    wrenSetSlotString(vm, 0, dest);

    free(ms->memory);
    ms->memory = dest;
    ms->size = strlen(dest);
}
 
/*
 * Foreign method `Curl.easyPerform()`: runs the transfer configured on the
 * handle stored in slot 0.
 *
 * CURLE_WRITE_ERROR is not reported: that is how libcurl signals that the
 * write callback stopped the transfer, which this program does on purpose
 * once its size limit is reached.  Any other failure is written to stderr
 * instead of being silently dropped.
 */
void C_easyPerform(WrenVM* vm) {
    CURL* curl = *(CURL**)wrenGetSlotForeign(vm, 0);
    CURLcode rc = curl_easy_perform(curl);
    if (rc != CURLE_OK && rc != CURLE_WRITE_ERROR) {
        fprintf(stderr, "curl_easy_perform() failed: %s\n", curl_easy_strerror(rc));
    }
}
 
/* Foreign method `Curl.easyCleanup()`: cleans up the handle stored in slot 0. */
void C_easyCleanup(WrenVM* vm) {
    CURL** handleSlot = wrenGetSlotForeign(vm, 0);
    curl_easy_cleanup(*handleSlot);
}
 
/*
 * Foreign method `Curl.easySetOpt(opt, param)`: slot 1 holds the option
 * number and slot 2 its value.
 *
 * Options numbered below 10000 take their value as a long.  Above that only
 * CURLOPT_WRITEDATA (a foreign Buffer), CURLOPT_URL (a string) and
 * CURLOPT_WRITEFUNCTION (always WriteMemoryCallback, whatever Wren passed)
 * are handled; every other option is ignored.
 */
void C_easySetOpt(WrenVM* vm) {
    CURL* handle = *(CURL**)wrenGetSlotForeign(vm, 0);
    CURLoption opt = (CURLoption)wrenGetSlotDouble(vm, 1);

    if (opt < 10000) {
        long value = (long)wrenGetSlotDouble(vm, 2);
        curl_easy_setopt(handle, opt, value);
        return;
    }

    switch (opt) {
    case CURLOPT_WRITEDATA: {
        void *sink = wrenGetSlotForeign(vm, 2);
        curl_easy_setopt(handle, opt, sink);
        break;
    }
    case CURLOPT_URL: {
        const char *address = wrenGetSlotString(vm, 2);
        curl_easy_setopt(handle, opt, address);
        break;
    }
    case CURLOPT_WRITEFUNCTION:
        curl_easy_setopt(handle, opt, &WriteMemoryCallback);
        break;
    default:
        break;
    }
}
 
/*
 * Wren callback that supplies the allocate/finalize hooks of a foreign class.
 * Only `Buffer` and `Curl` in module "main" are known; for anything else
 * both hooks are left NULL.
 */
WrenForeignClassMethods bindForeignClass(WrenVM* vm, const char* module, const char* className) {
    WrenForeignClassMethods hooks;
    hooks.allocate = NULL;
    hooks.finalize = NULL;

    if (strcmp(module, "main") != 0) {
        return hooks;
    }

    if (strcmp(className, "Buffer") == 0) {
        hooks.allocate = C_bufferAllocate;
        hooks.finalize = C_bufferFinalize;
    } else if (strcmp(className, "Curl") == 0) {
        hooks.allocate = C_curlAllocate;
    }
    return hooks;
}
 
/*
 * Wren callback that maps a foreign method signature to its C function.
 * Only instance methods of `Buffer` and `Curl` in module "main" are bound;
 * NULL is returned for everything else.
 */
WrenForeignMethodFn bindForeignMethod(
    WrenVM* vm,
    const char* module,
    const char* className,
    bool isStatic,
    const char* signature) {
    if (isStatic || strcmp(module, "main") != 0) {
        return NULL;
    }

    if (strcmp(className, "Buffer") == 0) {
        return strcmp(signature, "value") == 0 ? C_value : NULL;
    }

    if (strcmp(className, "Curl") == 0) {
        if (strcmp(signature, "easySetOpt(_,_)") == 0) return C_easySetOpt;
        if (strcmp(signature, "easyPerform()") == 0) return C_easyPerform;
        if (strcmp(signature, "easyCleanup()") == 0) return C_easyCleanup;
    }
    return NULL;
}
 
/* Wren output hook: writes the text produced by the script to stdout unchanged. */
static void writeFn(WrenVM* vm, const char* text) {
    fputs(text, stdout);
}
 
/*
 * Wren error hook: prints compile errors, stack-trace entries and runtime
 * errors to stdout, each in its own format.
 */
void errorFn(WrenVM* vm, WrenErrorType errorType, const char* module, const int line, const char* msg) {
    if (errorType == WREN_ERROR_COMPILE) {
        printf("[%s line %d] [Error] %s\n", module, line, msg);
    } else if (errorType == WREN_ERROR_STACK_TRACE) {
        printf("[%s line %d] in %s\n", module, line, msg);
    } else if (errorType == WREN_ERROR_RUNTIME) {
        printf("[Runtime Error] %s\n", msg);
    }
}
 
/*
 * Reads the whole of `fileName` into a freshly allocated, NUL-terminated
 * buffer.
 *
 * Returns the buffer, which the caller must free(), or NULL when the file
 * cannot be opened, measured, allocated for or read.
 */
char *readFile(const char *fileName) {
    FILE *f = fopen(fileName, "r");
    if (!f) {
        return NULL;
    }

    char *script = NULL;
    if (fseek(f, 0, SEEK_END) == 0) {
        long fsize = ftell(f);
        if (fsize >= 0) {
            rewind(f);
            script = malloc((size_t)fsize + 1);
            if (script) {
                /* Terminate at the count actually read: in text mode it may
                   be smaller than the size reported by ftell. */
                size_t nread = fread(script, 1, (size_t)fsize, f);
                if (ferror(f)) {
                    free(script);
                    script = NULL;
                } else {
                    script[nread] = 0;
                }
            }
        }
    }

    fclose(f);
    return script;
}
 
/* Completion hook for loadModule: frees the module source once Wren is done with it. */
static void loadModuleComplete(WrenVM* vm, const char* module, WrenLoadModuleResult result) {
    /* free(NULL) is a no-op, so no guard is needed. */
    free((void*)result.source);
}
 
/*
 * Wren module loader: for every module except "random" and "meta", reads
 * "<name>.wren" with readFile and registers loadModuleComplete to free the
 * source afterwards.  For those two names an all-zero result is returned.
 */
WrenLoadModuleResult loadModule(WrenVM* vm, const char* name) {
    WrenLoadModuleResult result = {0};
    if (strcmp(name, "random") == 0 || strcmp(name, "meta") == 0) {
        return result;
    }

    static const char suffix[] = ".wren";
    char fullName[strlen(name) + sizeof suffix]; /* sizeof counts the NUL */
    snprintf(fullName, sizeof fullName, "%s%s", name, suffix);

    result.source = readFile(fullName);
    result.onComplete = loadModuleComplete;
    return result;
}
 
/*
 * Host program entry point: configures a Wren VM with the callbacks defined
 * in this file, then loads and interprets WiktionaryDumps_to_words.wren as
 * module "main".
 *
 * Returns EXIT_SUCCESS when the script ran to completion, and EXIT_FAILURE
 * when no script buffer is available or Wren reports a compile or runtime
 * error.
 */
int main(int argc, char **argv) {
    const char* module = "main";
    const char* fileName = "WiktionaryDumps_to_words.wren";

    /* Load the script before creating the VM so a failure leaves nothing to tear down. */
    char *script = readFile(fileName);
    if (!script) {
        fprintf(stderr, "Unable to read script '%s'\n", fileName);
        return EXIT_FAILURE;
    }

    WrenConfiguration config;
    wrenInitConfiguration(&config);
    config.writeFn = &writeFn;
    config.errorFn = &errorFn;
    config.bindForeignClassFn = &bindForeignClass;
    config.bindForeignMethodFn = &bindForeignMethod;
    config.loadModuleFn = &loadModule;
    WrenVM* vm = wrenNewVM(&config);

    int status = EXIT_SUCCESS;
    WrenInterpretResult result = wrenInterpret(vm, module, script);
    switch (result) {
        case WREN_RESULT_COMPILE_ERROR:
            printf("Compile Error!\n");
            status = EXIT_FAILURE;
            break;
        case WREN_RESULT_RUNTIME_ERROR:
            printf("Runtime Error!\n");
            status = EXIT_FAILURE;
            break;
        case WREN_RESULT_SUCCESS:
            break;
        default:
            status = EXIT_FAILURE;
            break;
    }

    wrenFreeVM(vm);
    free(script);
    return status;
}</syntaxhighlight>
 
{{out}}
<pre>
gratis
gratuit
livre
chien
pond
pies
pie
A
connotation
minute
trade
adjective
adjectival
substantive
patronage
deal
merchandise
eagle
f
fa
fable
a-
</pre>
9,476

edits