WiktionaryDumps to words: Difference between revisions
Content added Content deleted
(Draft Task) |
(Added OCaml) |
||
Line 1: | Line 1: | ||
Use the [https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 wiktionary dump] to create a file equivalent to [http://manpages.ubuntu.com/manpages/bionic/man5/french.5.html "/usr/share/dict/french"]. Demonstrate how your language can handle this dump, which is a large bzip2-compressed XML file of about 800MB. The "/usr/share/dict/french" file is a text file containing one word of the French language per line. This file is available in Ubuntu in the package '''wfrench'''. |
Use the [https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 wiktionary dump] to create a file equivalent to [http://manpages.ubuntu.com/manpages/bionic/man5/french.5.html "/usr/share/dict/french"]. Demonstrate how your language can handle this dump, which is a large bzip2-compressed XML file of about 800MB. The "/usr/share/dict/french" file is a text file containing one word of the French language per line. This file is available in Ubuntu in the package '''wfrench'''. |
||
=={{header|OCaml}}== |
|||
<lang ocaml>(* Reads an XML dump on stdin and prints, one per line, the title of every
   page whose <text> content contains the heading "==French==". *)
let () =
  let input = Xmlm.make_input ~strip:true (`Channel stdin) in
  (* Character data of the most recently seen <title> element. *)
  let title = ref "" in
  (* Local names of the currently open elements, innermost first. *)
  let tag_path = ref [] in
  let push_tag tag = tag_path := tag :: !tag_path in
  let pop_tag () =
    match !tag_path with
    | [] -> ()
    | _ :: rest -> tag_path := rest
  in
  let last_tag_is tag =
    match !tag_path with
    | [] -> false
    | hd :: _ -> String.equal hd tag
  in
  (* Compiled once, outside the parsing loop. *)
  let french_heading = Str.regexp_string "==French==" in
  (* The heading is not guaranteed to sit at offset 0 of the page text, so
     search the whole string; Str.string_match would only test offset 0. *)
  let has_french_section text =
    match Str.search_forward french_heading text 0 with
    | _ -> true
    | exception Not_found -> false
  in
  while not (Xmlm.eoi input) do
    match Xmlm.input input with
    | `Dtd _ -> ()
    | `El_start ((_, tag_name), _) -> push_tag tag_name
    | `El_end -> pop_tag ()
    | `Data s ->
        if last_tag_is "title" then title := s;
        if last_tag_is "text" && has_french_section s then
          print_endline !title
  done</lang>
Revision as of 08:55, 9 December 2020
Use the wiktionary dump to create a file equivalent to "/usr/share/dict/french". Demonstrate how your language can handle this dump, which is a large bzip2-compressed XML file of about 800MB. The "/usr/share/dict/french" file is a text file containing one word of the French language per line. This file is available in Ubuntu in the package wfrench.
OCaml
<lang ocaml>(* Reads an XML dump on stdin and prints, one per line, the title of every
   page whose <text> content contains the heading "==French==". *)
let () =
  let input = Xmlm.make_input ~strip:true (`Channel stdin) in
  (* Character data of the most recently seen <title> element. *)
  let title = ref "" in
  (* Local names of the currently open elements, innermost first. *)
  let tag_path = ref [] in
  let push_tag tag = tag_path := tag :: !tag_path in
  let pop_tag () =
    match !tag_path with
    | [] -> ()
    | _ :: rest -> tag_path := rest
  in
  let last_tag_is tag =
    match !tag_path with
    | [] -> false
    | hd :: _ -> String.equal hd tag
  in
  (* Compiled once, outside the parsing loop. *)
  let french_heading = Str.regexp_string "==French==" in
  (* The heading is not guaranteed to sit at offset 0 of the page text, so
     search the whole string; Str.string_match would only test offset 0. *)
  let has_french_section text =
    match Str.search_forward french_heading text 0 with
    | _ -> true
    | exception Not_found -> false
  in
  while not (Xmlm.eoi input) do
    match Xmlm.input input with
    | `Dtd _ -> ()
    | `El_start ((_, tag_name), _) -> push_tag tag_name
    | `El_end -> pop_tag ()
    | `Data s ->
        if last_tag_is "title" then title := s;
        if last_tag_is "text" && has_french_section s then
          print_endline !title
  done</lang>