WiktionaryDumps to words

Revision as of 10:02, 9 December 2020 by rosettacode>Paddy3118 (Too vague)
NOTE
Please wait for the issues of the discussion page on the vagueness of the task to be addressed before adding to this task.

Use the Wiktionary dump to create a file equivalent to "/usr/share/dict/french". Demonstrate how your language can handle this dump, which is a large bz2-compressed XML file of about 800 MB. The "/usr/share/dict/french" file is a text file containing one French word per line. It is available in Ubuntu in the wfrench package.

OCaml

<lang ocaml>(* Reads XML from standard input (the usage shown with this program pipes in
   a decompressed Wiktionary dump) and prints, one per line, the most recent
   <title> text seen whenever a <text> element's content contains a
   "==French==" heading. *)
let () =
  let i = Xmlm.make_input ~strip:true (`Channel stdin) in
  (* Compiled once, outside the loop.  In Str, "^" matches at the start of
     the string or right after a newline, so the heading is recognised
     wherever it begins a line. *)
  let french_heading = Str.regexp "^==French==" in
  (* Search the whole text rather than only offset 0: the French section may
     be preceded by sections for other languages on the same page. *)
  let has_french_section text =
    match Str.search_forward french_heading text 0 with
    | _ -> true
    | exception Not_found -> false
  in
  (* Text of the most recently seen <title> element. *)
  let title = ref "" in
  (* Stack of currently open element names, innermost first. *)
  let tag_path = ref [] in
  let push_tag tag = tag_path := tag :: !tag_path in
  let pop_tag () =
    match !tag_path with
    | [] -> ()
    | _ :: tl -> tag_path := tl
  in
  while not (Xmlm.eoi i) do
    match Xmlm.input i with
    | `Dtd _ -> ()
    | `El_start ((_, tag_name), _) -> push_tag tag_name
    | `El_end -> pop_tag ()
    | `Data s ->
        (* Dispatch on the innermost open element. *)
        (match !tag_path with
         | "title" :: _ -> title := s
         | "text" :: _ -> if has_french_section s then print_endline !title
         | _ -> ())
  done</lang>
Output:
wget --quiet https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 -O - | bzcat | \
  ocaml str.cma -I $(ocamlfind query xmlm) xmlm.cma to_words.ml
livrer
observateur
qui a bu boira
quelque chose
grande parure
obiit
pleuvoir
voir
...