WiktionaryDumps to words

From Rosetta Code
Revision as of 08:55, 9 December 2020 by rosettacode>Blue Prawn (Added OCaml)

Use the wiktionary dump to create a file equivalent than "/usr/share/dict/french". Demonstrate how your language can handle this dump which is a big bz2'ed XML file of about 800MB. The "/usr/share/dict/french" file contains one word of the French language by line in a text file. This file is available in Ubuntu with the package wfrench.

OCaml

<lang ocaml>let () =

 let i = Xmlm.make_input ~strip:true (`Channel stdin) in
 let title = ref "" in
 let tag_path = ref [] in
 let push_tag tag =
   tag_path := tag :: !tag_path
 in
 let pop_tag () =
   match !tag_path with [] -> ()
   | _ :: tl -> tag_path := tl
 in
 let last_tag_is tag =
   match !tag_path with [] -> false
   | hd :: _ -> hd = tag
 in
 while not (Xmlm.eoi i) do
   match Xmlm.input i with
   | `Dtd dtd -> ()
   | `El_start ((uri, tag_name), attrs) -> push_tag tag_name
   | `El_end -> pop_tag ()
   | `Data s ->
       if last_tag_is "title"
       then title := s;
       if last_tag_is "text"
       then begin
         let reg = Str.regexp_string "==French==" in
         if Str.string_match reg s 0
         then print_endline !title
       end
 done</lang>