WiktionaryDumps to words: Difference between revisions

From Rosetta Code
Content added Content deleted
(Draft Task)
 
(Added OCaml)
Line 1: Line 1:
Use the [https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 wiktionary dump] to create a file equivalent to [http://manpages.ubuntu.com/manpages/bionic/man5/french.5.html "/usr/share/dict/french"]. Demonstrate how your language can handle this dump, which is a large bz2-compressed XML file of about 800MB. The "/usr/share/dict/french" file is a text file containing one French word per line. It is available in Ubuntu in the package '''wfrench'''.
Use the [https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 wiktionary dump] to create a file equivalent to [http://manpages.ubuntu.com/manpages/bionic/man5/french.5.html "/usr/share/dict/french"]. Demonstrate how your language can handle this dump, which is a large bz2-compressed XML file of about 800MB. The "/usr/share/dict/french" file is a text file containing one French word per line. It is available in Ubuntu in the package '''wfrench'''.

=={{header|OCaml}}==

<lang ocaml>(* Reads a Wiktionary XML dump on standard input and prints, one per line,
   the title of every page whose wikitext contains a "==French==" heading. *)
let () =
  let input = Xmlm.make_input ~strip:true (`Channel stdin) in
  (* Compiled once, outside the loop.  In Str, [^] matches at the start of
     the string or right after a newline, so the level-2 heading is found
     wherever it sits in the page, not only when it is the first section. *)
  let french_heading = Str.regexp "^==French==" in
  let has_french_section text =
    match Str.search_forward french_heading text 0 with
    | _ -> true
    | exception Not_found -> false
  in
  (* Character data of the most recently seen "title" element. *)
  let title = ref "" in
  (* Names of the currently open elements, innermost first. *)
  let tag_path = ref [] in
  let push_tag tag = tag_path := tag :: !tag_path in
  let pop_tag () =
    match !tag_path with
    | [] -> ()
    | _ :: tl -> tag_path := tl
  in
  let last_tag_is tag =
    match !tag_path with
    | [] -> false
    | hd :: _ -> String.equal hd tag
  in
  while not (Xmlm.eoi input) do
    match Xmlm.input input with
    | `Dtd _ -> ()
    | `El_start ((_, tag_name), _) -> push_tag tag_name
    | `El_end -> pop_tag ()
    | `Data s ->
        (* The innermost element cannot be both "title" and "text", so the
           two cases are mutually exclusive. *)
        if last_tag_is "title" then title := s
        else if last_tag_is "text" && has_french_section s then
          print_endline !title
  done</lang>

Revision as of 08:55, 9 December 2020

Use the wiktionary dump to create a file equivalent to "/usr/share/dict/french". Demonstrate how your language can handle this dump, which is a large bz2-compressed XML file of about 800MB. The "/usr/share/dict/french" file is a text file containing one French word per line. It is available in Ubuntu in the package wfrench.

OCaml

<lang ocaml>(* Reads a Wiktionary XML dump on standard input and prints, one per line,
   the title of every page whose wikitext contains a "==French==" heading. *)
let () =
  let input = Xmlm.make_input ~strip:true (`Channel stdin) in
  (* Compiled once, outside the loop.  In Str, [^] matches at the start of
     the string or right after a newline, so the level-2 heading is found
     wherever it sits in the page, not only when it is the first section. *)
  let french_heading = Str.regexp "^==French==" in
  let has_french_section text =
    match Str.search_forward french_heading text 0 with
    | _ -> true
    | exception Not_found -> false
  in
  (* Character data of the most recently seen "title" element. *)
  let title = ref "" in
  (* Names of the currently open elements, innermost first. *)
  let tag_path = ref [] in
  let push_tag tag = tag_path := tag :: !tag_path in
  let pop_tag () =
    match !tag_path with
    | [] -> ()
    | _ :: tl -> tag_path := tl
  in
  let last_tag_is tag =
    match !tag_path with
    | [] -> false
    | hd :: _ -> String.equal hd tag
  in
  while not (Xmlm.eoi input) do
    match Xmlm.input input with
    | `Dtd _ -> ()
    | `El_start ((_, tag_name), _) -> push_tag tag_name
    | `El_end -> pop_tag ()
    | `Data s ->
        (* The innermost element cannot be both "title" and "text", so the
           two cases are mutually exclusive. *)
        if last_tag_is "title" then title := s
        else if last_tag_is "text" && has_french_section s then
          print_endline !title
  done</lang>