WiktionaryDumps to words: Difference between revisions
Content added Content deleted
m (→{{header|Perl}}: simplify) |
(Added Wren) |
||
Line 718: | Line 718: | ||
pond |
pond |
||
livre |
livre |
||
</pre> |
|||
=={{header|Wren}}== |
|||
{{trans|Julia}} |
|||
{{libheader|libcurl}} |
|||
{{libheader|libbzip2}} |
|||
{{libheader|Wren-pattern}} |
|||
An embedded program so we can use libcurl and libbzip2. |
|||
Rather than downloading the full 800MB .bz2 file and then decompressing it, we abort the download after receiving at most the first 512 KB and then decompress just that portion, ignoring the resulting BZ_UNEXPECTED_EOF error. This turns out to be enough to find the first 26 French words. |
|||
<lang ecmascript>/* wiktionary_dumps_to_words.wren */ |
|||
import "./pattern" for Pattern |
|||
// Numeric libcurl option codes handed to Curl.easySetOpt(opt, param).
var CURLOPT_URL            = 10002  // the address to fetch
var CURLOPT_FOLLOWLOCATION = 52     // non-zero: follow HTTP redirects
var CURLOPT_WRITEFUNCTION  = 20011  // callback that receives the data
var CURLOPT_WRITEDATA      = 10001  // object passed to that callback
|||
// Download buffer whose storage lives on the C side of the embedding.
foreign class Buffer {
    // The host program allocates the underlying storage.
    construct new() {}

    // Decompressed buffer contents, as a string.
    foreign value
}
|||
// Thin wrapper over a libcurl "easy" handle; every method is implemented in C.
foreign class Curl {
    // Creates the underlying handle.
    construct easyInit() {}

    // Sets option 'opt' (one of the CURLOPT_* codes) to 'param'.
    foreign easySetOpt(opt, param)

    // Runs the transfer configured so far.
    foreign easyPerform()

    // Releases the underlying handle.
    foreign easyCleanup()
}
|||
// The one transfer handle used by this script.
var curl = Curl.easyInit()

// Fetches 'url' into a fresh Buffer and returns the buffer's decompressed text.
var getContent = Fn.new { |url|
    var sink = Buffer.new()

    curl.easySetOpt(CURLOPT_URL, url)
    curl.easySetOpt(CURLOPT_FOLLOWLOCATION, 1)
    curl.easySetOpt(CURLOPT_WRITEFUNCTION, 0)  // write function to be supplied by C
    curl.easySetOpt(CURLOPT_WRITEDATA, sink)

    curl.easyPerform()
    return sink.value
}
|||
var url = "https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2"
var content = getContent.call(url)
curl.easyCleanup()

// Markers searched for on each line of the dump.
var title    = "<title>"
var txtOpen  = "<text"
var txtClose = "</text>"
var langMark = "==French=="

// Captures the page name found between <title> and </title>.
var p = Pattern.new("<title>[+1^<]<//title>")

var titleWord   = ""     // title of the page currently being scanned
var gotTextLast = false  // true once that page's <text> element has been opened

for (line in content.split("\n")) {
    if (line.indexOf(title) != -1) {
        // A new page begins: remember its title and reset the state.
        gotTextLast = false
        var m = p.find(line)
        if (m) {
            titleWord = m.capsText[0]
        } else {
            titleWord = ""
        }
    } else if (line.indexOf(txtOpen) != -1) {
        gotTextLast = true
    } else if (line.indexOf(langMark) != -1) {
        // Language heading inside the page text: report the page once.
        if (gotTextLast && titleWord != "") System.print(titleWord)
        gotTextLast = false
    } else if (line.indexOf(txtClose) != -1) {
        gotTextLast = false
    }
}</lang>
|||
<br> |
|||
We now embed this script in the following C program, then build and run it. |
|||
<lang c>/* gcc wiktionary_dumps_to_words.c -o wiktionary_dumps_to_words -lcurl -lbz2 -lwren -lm */ |
|||
#include <stdio.h> |
|||
#include <stdlib.h> |
|||
#include <string.h> |
|||
#include <curl/curl.h> |
|||
#include <bzlib.h> |
|||
#include "wren.h" |
|||
/* Heap-backed byte buffer together with its current length. */
struct MemoryStruct {
    char *memory;   /* start of the heap block */
    size_t size;    /* number of bytes currently stored */
};

/* Size cap, in bytes, applied to the download buffer: 512 KB. */
const size_t LIMIT = 512 * 1024;
|||
/* C <=> Wren interface functions */ |
|||
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) { |
|||
size_t realsize = size * nmemb; |
|||
struct MemoryStruct *mem = (struct MemoryStruct *)userp; |
|||
size_t size_needed = mem->size + realsize + 1; |
|||
if (size_needed > LIMIT) return -1; // abort download |
|||
char *ptr = realloc(mem->memory, size_needed); |
|||
if(!ptr) { |
|||
/* out of memory! */ |
|||
printf("not enough memory (realloc returned NULL)\n"); |
|||
return 0; |
|||
} |
|||
mem->memory = ptr; |
|||
memcpy(&(mem->memory[mem->size]), contents, realsize); |
|||
mem->size += realsize; |
|||
mem->memory[mem->size] = 0; |
|||
return realsize; |
|||
} |
|||
/* Allocator for the Wren 'Buffer' class: creates the foreign MemoryStruct in
   slot 0 and gives it an empty one-byte heap block to grow from. */
void C_bufferAllocate(WrenVM* vm) {
    struct MemoryStruct *buf = wrenSetSlotNewForeign(vm, 0, 0, sizeof *buf);
    buf->size = 0;
    buf->memory = malloc(1);
}
|||
void C_bufferFinalize(void* data) { |
|||
struct MemoryStruct *ms = (struct MemoryStruct *)data; |
|||
free(ms->memory); |
|||
} |
|||
/* Allocator for the Wren 'Curl' class: the foreign object in slot 0 stores a
   single CURL* obtained from curl_easy_init(). */
void C_curlAllocate(WrenVM* vm) {
    CURL **slot = wrenSetSlotNewForeign(vm, 0, 0, sizeof *slot);
    *slot = curl_easy_init();
}
|||
/*
 * Foreign getter Buffer.value: decompresses the bzip2 data held by the
 * receiver (slot 0) and returns the result to Wren as a string in slot 0.
 *
 * The download is deliberately cut short, so BZ_UNEXPECTED_EOF is expected
 * and ignored; any other bzlib error is reported. The output buffer is
 * zero-filled and one byte larger than the capacity given to bzlib, so the
 * text handed to Wren is always NUL-terminated, even when bzlib stops early
 * and leaves the tail of the buffer untouched.
 *
 * The compressed data in the receiver is left unmodified, so the getter can
 * be called repeatedly. An empty string is returned if memory runs out.
 */
void C_value(WrenVM* vm) {
    struct MemoryStruct *ms = (struct MemoryStruct *)wrenGetSlotForeign(vm, 0);

    /* decompress string before returning to Wren */
    unsigned int destLen = ms->size * 5; // should be more than enough

    /* calloc: every byte bzlib does not write stays 0 and acts as terminator */
    char *dest = calloc((size_t)destLen + 1, 1);
    if (!dest) {
        /* out of memory! */
        printf("not enough memory (calloc returned NULL)\n");
        wrenSetSlotString(vm, 0, "");
        return;
    }

    int ret = BZ2_bzBuffToBuffDecompress(dest, &destLen, ms->memory, (unsigned int)ms->size, 0, 0);

    /* should get a 'compressed data ends unexpectedly' error here which we ignore
       but report any other error */
    if (ret != BZ_UNEXPECTED_EOF && ret != BZ_OK) printf("error number %d occurred", ret);

    /* Wren copies the bytes, so the scratch buffer can be released right away */
    wrenSetSlotString(vm, 0, dest);
    free(dest);
}
|||
/* Foreign method Curl.easyPerform(): runs the transfer on the receiver's
   handle. The result code of curl_easy_perform() is not inspected. */
void C_easyPerform(WrenVM* vm) {
    CURL **slot = (CURL **)wrenGetSlotForeign(vm, 0);
    curl_easy_perform(*slot);
}
|||
/*
 * Foreign method Curl.easyCleanup(): releases the receiver's libcurl handle.
 *
 * The stored pointer is cleared afterwards so that the foreign object no
 * longer holds a dangling handle; calling easyCleanup() a second time is
 * then a no-op instead of a double free.
 */
void C_easyCleanup(WrenVM* vm) {
    CURL **slot = (CURL **)wrenGetSlotForeign(vm, 0);
    if (*slot) {
        curl_easy_cleanup(*slot);
        *slot = NULL;
    }
}
|||
/*
 * Foreign method Curl.easySetOpt(opt, param).
 *
 * Slot 1 holds the numeric option code and slot 2 its parameter. Codes below
 * 10000 are set with the parameter converted to long. Above that only three
 * options are handled: CURLOPT_WRITEDATA (parameter is a foreign Buffer),
 * CURLOPT_URL (parameter is a string) and CURLOPT_WRITEFUNCTION (parameter is
 * ignored and WriteMemoryCallback is installed). Every other code is
 * silently ignored.
 */
void C_easySetOpt(WrenVM* vm) {
    CURL *handle = *(CURL **)wrenGetSlotForeign(vm, 0);
    CURLoption option = (CURLoption)wrenGetSlotDouble(vm, 1);

    if (option < 10000) {
        long value = (long)wrenGetSlotDouble(vm, 2);
        curl_easy_setopt(handle, option, value);
        return;
    }

    switch (option) {
    case CURLOPT_WRITEDATA: {
        struct MemoryStruct *target = (struct MemoryStruct *)wrenGetSlotForeign(vm, 2);
        curl_easy_setopt(handle, option, (void *)target);
        break;
    }
    case CURLOPT_URL: {
        const char *address = wrenGetSlotString(vm, 2);
        curl_easy_setopt(handle, option, address);
        break;
    }
    case CURLOPT_WRITEFUNCTION:
        curl_easy_setopt(handle, option, &WriteMemoryCallback);
        break;
    default:
        break;
    }
}
|||
/*
 * Returns the allocate/finalize pair for a foreign class. Only the "Buffer"
 * and "Curl" classes of module "main" are known; for anything else both
 * members are NULL. "Curl" has an allocator but no finalizer.
 */
WrenForeignClassMethods bindForeignClass(WrenVM* vm, const char* module, const char* className) {
    WrenForeignClassMethods methods = { .allocate = NULL, .finalize = NULL };

    if (strcmp(module, "main") != 0) {
        return methods;
    }

    if (strcmp(className, "Buffer") == 0) {
        methods.allocate = C_bufferAllocate;
        methods.finalize = C_bufferFinalize;
    } else if (strcmp(className, "Curl") == 0) {
        methods.allocate = C_curlAllocate;
    }
    return methods;
}
|||
WrenForeignMethodFn bindForeignMethod( |
|||
WrenVM* vm, |
|||
const char* module, |
|||
const char* className, |
|||
bool isStatic, |
|||
const char* signature) { |
|||
if (strcmp(module, "main") == 0) { |
|||
if (strcmp(className, "Buffer") == 0) { |
|||
if (!isStatic && strcmp(signature, "value") == 0) return C_value; |
|||
} else if (strcmp(className, "Curl") == 0) { |
|||
if (!isStatic && strcmp(signature, "easySetOpt(_,_)") == 0) return C_easySetOpt; |
|||
if (!isStatic && strcmp(signature, "easyPerform()") == 0) return C_easyPerform; |
|||
if (!isStatic && strcmp(signature, "easyCleanup()") == 0) return C_easyCleanup; |
|||
} |
|||
} |
|||
return NULL; |
|||
} |
|||
/* Output hook for the VM: copies 'text' to standard output unchanged. */
static void writeFn(WrenVM* vm, const char* text) {
    fputs(text, stdout);
}
|||
/* Error hook for the VM: prints compile errors, stack-trace entries and
   runtime errors to standard output, each in its own format. */
void errorFn(WrenVM* vm, WrenErrorType errorType, const char* module, const int line, const char* msg) {
    if (errorType == WREN_ERROR_COMPILE) {
        printf("[%s line %d] [Error] %s\n", module, line, msg);
    } else if (errorType == WREN_ERROR_STACK_TRACE) {
        printf("[%s line %d] in %s\n", module, line, msg);
    } else if (errorType == WREN_ERROR_RUNTIME) {
        printf("[Runtime Error] %s\n", msg);
    }
}
|||
/*
 * Reads the whole of 'fileName' into a freshly allocated, NUL-terminated
 * buffer.
 *
 * Returns the buffer, which the caller must free(), or NULL if the file
 * cannot be opened, its size cannot be determined, memory runs out or a
 * read error occurs.
 */
char *readFile(const char *fileName) {
    FILE *f = fopen(fileName, "r");
    if (!f) {
        return NULL;
    }

    char *script = NULL;
    long fsize = -1;

    if (fseek(f, 0, SEEK_END) == 0) {
        fsize = ftell(f);
    }
    if (fsize < 0) {
        goto done;
    }
    rewind(f);

    script = malloc((size_t)fsize + 1);
    if (!script) {
        goto done;
    }

    /* In text mode fewer than fsize bytes may come back, so terminate the
       string at the number of bytes actually read. */
    size_t nread = fread(script, 1, (size_t)fsize, f);
    if (ferror(f)) {
        free(script);
        script = NULL;
        goto done;
    }
    script[nread] = 0;

done:
    fclose(f);
    return script;
}
|||
/* Completion hook for a module load: frees the source text stored in
   'result' (nothing happens when it is NULL). */
static void loadModuleComplete(WrenVM* vm, const char* module, WrenLoadModuleResult result) {
    free((void *)result.source);
}
|||
/*
 * Module loader hook. For every module except "random" and "meta" the source
 * is read from the file "<name>.wren" and loadModuleComplete is registered
 * as the completion callback; for those two names a zeroed result is
 * returned.
 */
WrenLoadModuleResult loadModule(WrenVM* vm, const char* name) {
    WrenLoadModuleResult result = {0};

    if (strcmp(name, "random") == 0 || strcmp(name, "meta") == 0) {
        return result;
    }

    /* sizeof ".wren" counts the suffix plus its terminating NUL */
    size_t nameLen = strlen(name);
    char fullName[nameLen + sizeof ".wren"];
    memcpy(fullName, name, nameLen);
    memcpy(fullName + nameLen, ".wren", sizeof ".wren");

    result.onComplete = loadModuleComplete;
    result.source = readFile(fullName);
    return result;
}
|||
int main(int argc, char **argv) { |
|||
WrenConfiguration config; |
|||
wrenInitConfiguration(&config); |
|||
config.writeFn = &writeFn; |
|||
config.errorFn = &errorFn; |
|||
config.bindForeignClassFn = &bindForeignClass; |
|||
config.bindForeignMethodFn = &bindForeignMethod; |
|||
config.loadModuleFn = &loadModule; |
|||
WrenVM* vm = wrenNewVM(&config); |
|||
const char* module = "main"; |
|||
const char* fileName = "wiktionary_dumps_to_words.wren"; |
|||
char *script = readFile(fileName); |
|||
WrenInterpretResult result = wrenInterpret(vm, module, script); |
|||
switch (result) { |
|||
case WREN_RESULT_COMPILE_ERROR: |
|||
printf("Compile Error!\n"); |
|||
break; |
|||
case WREN_RESULT_RUNTIME_ERROR: |
|||
printf("Runtime Error!\n"); |
|||
break; |
|||
case WREN_RESULT_SUCCESS: |
|||
break; |
|||
} |
|||
wrenFreeVM(vm); |
|||
free(script); |
|||
return 0; |
|||
}</lang> |
|||
{{out}} |
|||
<pre> |
|||
gratis |
|||
gratuit |
|||
livre |
|||
chien |
|||
pond |
|||
pies |
|||
pie |
|||
A |
|||
connotation |
|||
minute |
|||
trade |
|||
adjective |
|||
adjectival |
|||
substantive |
|||
patronage |
|||
deal |
|||
merchandise |
|||
eagle |
|||
f |
|||
fa |
|||
fable |
|||
a- |
|||
abaca |
|||
abada |
|||
abalone |
|||
abandon |
|||
</pre> |
</pre> |