WiktionaryDumps to words: Difference between revisions
Content added Content deleted
(→{{header|Raku}}: revised) |
m (→{{header|Phix}}: fixed syntax colouring hiccup ("block" as a keyword, which it isn't)) |
||
Line 486: | Line 486: | ||
Does not rely on wget/bzcat etc. Downloads in 16K or so blocks, unpacks one block at a time in memory, terminates properly when 5 or more words are found.<br> |
Does not rely on wget/bzcat etc. Downloads in 16K or so blocks, unpacks one block at a time in memory, terminates properly when 5 or more words are found.<br> |
||
Tested on Windows, should be fine on Linux as long as you can provide a suitable bz2.so |
Tested on Windows, should be fine on Linux as long as you can provide a suitable bz2.so |
||
<!--<lang Phix>( |
<!--<lang Phix>(phixonline)--> |
||
<span style="color: #008080;">constant</span> <span style="color: #000000;">url</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">"https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2"</span> |
<span style="color: #008080;">constant</span> <span style="color: #000000;">url</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">"https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2"</span> |
||
Line 542: | Line 542: | ||
<span style="color: #000000;">avail_out</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">get_struct_field</span><span style="color: #0000FF;">(</span><span style="color: #000000;">id_bzs</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p_bzs</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"avail_out"</span><span style="color: #0000FF;">)</span> |
<span style="color: #000000;">avail_out</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">get_struct_field</span><span style="color: #0000FF;">(</span><span style="color: #000000;">id_bzs</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p_bzs</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"avail_out"</span><span style="color: #0000FF;">)</span> |
||
<span style="color: #008080;">if</span> <span style="color: #000000;">avail_out</span><span style="color: #0000FF;"><</span><span style="color: #000000;">BLOCKSIZE</span> <span style="color: #008080;">then</span> |
<span style="color: #008080;">if</span> <span style="color: #000000;">avail_out</span><span style="color: #0000FF;"><</span><span style="color: #000000;">BLOCKSIZE</span> <span style="color: #008080;">then</span> |
||
<span style="color: #004080;">string</span> <span style="color: # |
<span style="color: #004080;">string</span> <span style="color: #000000;">block</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">demiline</span> <span style="color: #0000FF;">&</span> <span style="color: #7060A8;">peek</span><span style="color: #0000FF;">({</span><span style="color: #000000;">outbuf</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BLOCKSIZE</span><span style="color: #0000FF;">-</span><span style="color: #000000;">avail_out</span><span style="color: #0000FF;">})</span> |
||
<span style="color: #004080;">integer</span> <span style="color: #000000;">linestart</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span> |
<span style="color: #004080;">integer</span> <span style="color: #000000;">linestart</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span> |
||
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: # |
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">block</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span> |
||
<span style="color: #008080;">if</span> <span style="color: # |
<span style="color: #008080;">if</span> <span style="color: #000000;">block</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]=</span><span style="color: #008000;">'\n'</span> <span style="color: #008080;">then</span> |
||
<span style="color: #008080;">if</span> <span style="color: #008080;">not</span> <span style="color: #000000;">doline</span><span style="color: #0000FF;">(</span><span style="color: # |
<span style="color: #008080;">if</span> <span style="color: #008080;">not</span> <span style="color: #000000;">doline</span><span style="color: #0000FF;">(</span><span style="color: #000000;">block</span><span style="color: #0000FF;">[</span><span style="color: #000000;">linestart</span><span style="color: #0000FF;">..</span><span style="color: #000000;">i</span><span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">then</span> |
||
<span style="color: #000000;">BZ2_bzDecompressEnd</span><span style="color: #0000FF;">()</span> |
<span style="color: #000000;">BZ2_bzDecompressEnd</span><span style="color: #0000FF;">()</span> |
||
<span style="color: #008080;">return</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- terminate download</span> |
<span style="color: #008080;">return</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- terminate download</span> |
||
Line 553: | Line 553: | ||
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span> |
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span> |
||
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span> |
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span> |
||
<span style="color: #000000;">demiline</span> <span style="color: #0000FF;">=</span> <span style="color: # |
<span style="color: #000000;">demiline</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">block</span><span style="color: #0000FF;">[</span><span style="color: #000000;">linestart</span><span style="color: #0000FF;">..$]</span> |
||
<span style="color: #000000;">set_struct_field</span><span style="color: #0000FF;">(</span><span style="color: #000000;">id_bzs</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p_bzs</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"next_out"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">outbuf</span><span style="color: #0000FF;">)</span> |
<span style="color: #000000;">set_struct_field</span><span style="color: #0000FF;">(</span><span style="color: #000000;">id_bzs</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p_bzs</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"next_out"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">outbuf</span><span style="color: #0000FF;">)</span> |
||
<span style="color: #000000;">set_struct_field</span><span style="color: #0000FF;">(</span><span style="color: #000000;">id_bzs</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p_bzs</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"avail_out"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BLOCKSIZE</span><span style="color: #0000FF;">)</span> |
<span style="color: #000000;">set_struct_field</span><span style="color: #0000FF;">(</span><span style="color: #000000;">id_bzs</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p_bzs</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"avail_out"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BLOCKSIZE</span><span style="color: #0000FF;">)</span> |