Web scraping: Difference between revisions
Line 185: | Line 185: | ||
or more concisely: |
or more concisely: |
||
<lang lisp> |
<lang lisp>(use 'clojure.contrib.duck-streams) |
||
(use 'clojure.contrib.duck-streams) |
|||
(second (re-find #" (\d{1,2}:\d{1,2}:\d{1,2}) UTC" (slurp* "http://tycho.usno.navy.mil/cgi-bin/timer.pl"))) |
(second (re-find #" (\d{1,2}:\d{1,2}:\d{1,2}) UTC" (slurp* "http://tycho.usno.navy.mil/cgi-bin/timer.pl"))) |
||
</lang> |
</lang> |
Revision as of 19:19, 9 July 2010
You are encouraged to solve this task according to the task description, using any language you may know.
Create a program that downloads the time from this URL: http://tycho.usno.navy.mil/cgi-bin/timer.pl and then prints the current UTC time by extracting just the UTC time from the web page's HTML.
If possible, only use libraries that come at no extra monetary cost with the programming language and that are widely available and popular such as CPAN for Perl or Boost for C++.
AutoHotkey
<lang AutoHotkey>UrlDownloadToFile, http://tycho.usno.navy.mil/cgi-bin/timer.pl, time.html FileRead, timefile, time.html pos := InStr(timefile, "UTC") msgbox % time := SubStr(timefile, pos - 9, 8)</lang>
AWK
This is inspired by GETURL example in the manual for gawk.
<lang awk>#! /usr/bin/awk -f
BEGIN {
purl = "/inet/tcp/0/tycho.usno.navy.mil/80" ORS = RS = "\r\n\r\n" print "GET /cgi-bin/timer.pl HTTP/1.0" |& purl purl |& getline header while ( (purl |& getline ) > 0 ) { split($0, a, "\n") for(i=1; i <= length(a); i++) { if ( a[i] ~ /UTC/ ) { sub(/^
/, "", a[i]) printf "%s\n", a[i] } } } close(purl)
}</lang>
C
There is no proper error handling.
<lang c>#include <stdio.h>
- include <string.h>
- include <curl/curl.h>
- include <sys/types.h>
- include <regex.h>
- define BUFSIZE 16384
size_t lr = 0;
size_t filterit(void *ptr, size_t size, size_t nmemb, void *stream) {
if ( (lr + size*nmemb) > BUFSIZE ) return BUFSIZE; memcpy(stream+lr, ptr, size*nmemb); lr += size*nmemb; return size*nmemb;
}
int main() {
CURL *curlHandle; char buffer[BUFSIZE]; regmatch_t amatch; regex_t cregex;
curlHandle = curl_easy_init(); curl_easy_setopt(curlHandle, CURLOPT_URL, "http://tycho.usno.navy.mil/cgi-bin/timer.pl"); curl_easy_setopt(curlHandle, CURLOPT_FOLLOWLOCATION, 1); curl_easy_setopt(curlHandle, CURLOPT_WRITEFUNCTION, filterit); curl_easy_setopt(curlHandle, CURLOPT_WRITEDATA, buffer); int success = curl_easy_perform(curlHandle); curl_easy_cleanup(curlHandle);
buffer[lr] = 0; regcomp(&cregex, " UTC", REG_NEWLINE); regexec(&cregex, buffer, 1, &amatch, 0); int bi = amatch.rm_so; while ( bi-- > 0 ) if ( memcmp(&buffer[bi], "
", 4) == 0 ) break;
buffer[amatch.rm_eo] = 0;
printf("%s\n", &buffer[bi+4]);
regfree(&cregex); return 0;
}</lang>
C++
Library: Boost
to be compiled under Linux with g++ -lboost_regex -lboost_system -lboost_thread
<lang cpp>#include <iostream>
- include <istream>
- include <ostream>
- include <string>
- include <boost/asio.hpp>
- include <boost/regex.hpp>
- include <sstream>
int main( ) {
try { boost::asio::io_service io_service ; boost::asio::ip::tcp::resolver resolver ( io_service ) ; //we now try to get a list of endpoints to the server boost::asio::ip::tcp::resolver::query query( "tycho.usno.navy.mil" ,
"http" ) ;
boost::asio::ip::tcp::resolver::iterator endpoint_iterator =
resolver.resolve( query ) ;
boost::asio::ip::tcp::resolver::iterator end ; //looking for a successful endpoint connection boost::asio::ip::tcp::socket socket ( io_service ) ; boost::system::error_code error =
boost::asio::error::host_not_found ;
while ( error && endpoint_iterator != end ) {
socket.close( ) ; socket.connect ( *endpoint_iterator++ , error ) ;
} if ( error )
throw boost::system::system_error ( error ) ;
boost::asio::streambuf request ; //we now write the request std::ostream request_stream( &request ) ; request_stream << "GET /cgi-bin/timer.pl HTTP/1.0\r\n" ; request_stream << "Host: " << "tycho.usno.navy.mil" << "\r\n" ; request_stream << "Accept: */*\r\n" ; request_stream << "Connection: close\r\n\r\n" ; boost::asio::write ( socket , request ) ; //we write the request boost::asio::streambuf response ; boost::asio::read_until( socket , response , "\r\n\r\n" ) ; std::ostringstream line ; //reading to end , disregarding possible error messages boost::regex e ( "
(.+\\s+UTC)" ) ; while ( boost::asio::read( socket , response,
boost::asio::transfer_at_least( 1 ) , error )) { line << &response ; boost::smatch matches ; if ( boost::regex_search( line.str( ) , matches, e ) ) std::cout << matches[ 1 ] << std::endl ; line.str( "" ) ;
} if ( error != boost::asio::error::eof )
throw boost::system::system_error( error ) ;
} catch ( std::exception & ex ) {
std::cout << "Exception: " << ex.what( ) << "\n" ;
} return 0 ;
}</lang>
C#
<lang csharp>using System;
using System.IO;
using System.Net;

class Program
{
    // Downloads the time page, finds the first line containing "UTC" and
    // prints it without its leading four-character <BR> tag.
    static void Main(string[] args)
    {
        string utcLine = null;

        // The using statements release the client, the stream and the reader
        // even when the download fails part-way through.
        using (WebClient wc = new WebClient())
        using (Stream myStream = wc.OpenRead("http://tycho.usno.navy.mil/cgi-bin/timer.pl"))
        using (StreamReader sr = new StreamReader(myStream))
        {
            string html;
            // ReadLine() returns null at end of stream; Peek() can report
            // "no data" on a network stream before the response is complete.
            while ((html = sr.ReadLine()) != null)
            {
                if (html.Contains("UTC"))
                {
                    utcLine = html;
                    break;
                }
            }
        }

        // Only strip the tag when a matching line was actually found.
        if (utcLine != null && utcLine.Length >= 4)
        {
            Console.WriteLine(utcLine.Remove(0, 4));
        }
        else
        {
            Console.WriteLine("UTC time not found.");
        }

        Console.ReadLine();
    }
}
</lang>
Clojure
<lang lisp>(use 'clojure.contrib.duck-streams) (use 'clojure.contrib.str-utils) (let [url "http://tycho.usno.navy.mil/cgi-bin/timer.pl"]
(first (filter #(re-find #"\d{1,2}:\d{1,2}:\d{1,2}" %) (re-split #" " (first (filter #(re-find #"UTC" %) (re-split #"\n" (slurp* url))))))))</lang>
or more concisely:
<lang lisp>(use 'clojure.contrib.duck-streams) (second (re-find #" (\d{1,2}:\d{1,2}:\d{1,2}) UTC" (slurp* "http://tycho.usno.navy.mil/cgi-bin/timer.pl"))) </lang>
or in upcoming Clojure 1.2 in a one-liner:
<lang lisp>(second (re-find #" (\d{1,2}:\d{1,2}:\d{1,2}) UTC" (slurp "http://tycho.usno.navy.mil/cgi-bin/timer.pl")))</lang>
Common Lisp
<lang lisp>BOA> (let* ((url "http://tycho.usno.navy.mil/cgi-bin/timer.pl")
(regexp (load-time-value (cl-ppcre:create-scanner "(?m)^.{4}(.+? UTC)"))) (data (drakma:http-request url))) (multiple-value-bind (start end start-regs end-regs) (cl-ppcre:scan regexp data) (declare (ignore end)) (when start (subseq data (aref start-regs 0) (aref end-regs 0)))))
"Aug. 12, 04:29:51 UTC"</lang>
D
<lang d>import std.string: find; import std.regexp: RegExp, search; import std.socket;
void main() {
InternetAddress addr = new InternetAddress("tycho.usno.navy.mil", 80); TcpSocket sock = new TcpSocket(addr);
sock.send("GET /cgi-bin/timer.pl HTTP/1.0\r\n"); sock.send("Host: tycho.usno.navy.mil\r\n"); sock.send("Connection: close\r\n"); sock.send("\r\n");
char[1024] buffer; char[] msg; uint buflen = 1024; while(buflen == 1024) { buflen = sock.receive(buffer); msg ~= buffer[0..buflen]; }
sock.shutdown(SocketShutdown.BOTH); sock.close();
int constart = find(msg, "\r\n\r\n") + 4; // we assume this went OK msg = msg[constart..$];
RegExp match = search(msg, `
([A-Z0-9:., ]+ UTC)`, `i`); // `...` is the same as r"..." (means that \ does not escape) printf("Time: %.*s\n", match.match(1));
}</lang>
E
<lang e>interp.waitAtTop(when (def html := <http://tycho.usno.navy.mil/cgi-bin/timer.pl>.getText()) -> {
def rx`(?s).*>(@time.*? UTC).*` := html println(time)
})</lang>
Erlang
Using regular expressions:
<lang erlang>-module(scraping).
-export([main/0]).
-define(Url, "http://tycho.usno.navy.mil/cgi-bin/timer.pl").
-define(Match, "
(.+ UTC)").
main() -> inets:start(), {ok, {_Status, _Header, HTML}} = http:request(?Url), {match, [Time]} = re:run(HTML, ?Match, [{capture, all_but_first, binary}]), io:format("~s~n",[Time]).</lang>
F#
This code is asynchronous - it will not block any threads while it waits on a response from the remote server. <lang fsharp> open System open System.Net open System.Text.RegularExpressions
async {
use wc = new WebClient() let! html = wc.AsyncDownloadString(Uri("http://tycho.usno.navy.mil/cgi-bin/timer.pl")) return Regex.Match(html, @"
(.+ UTC)").Groups.[1].Value
} |> Async.RunSynchronously |> printfn "%s" </lang>
Factor
<lang factor>USING: http.get io sequences ;
"http://tycho.usno.navy.mil/cgi-bin/timer.pl" http-get nip [ "UTC" swap start [ 9 - ] [ 1 - ] bi ] keep subseq print</lang>
Forth
<lang forth>include unix/socket.fs
- extract-time ( addr len type len -- time len )
dup >r search 0= abort" that time not present!" dup >r begin -1 /string over 1- c@ [char] > = until \ seek back to
at start of line r> - r> + ;
s" tycho.usno.navy.mil" 80 open-socket dup s\" GET /cgi-bin/timer.pl HTTP/1.0\n\n" rot write-socket dup pad 4096 read-socket s\" \r\n\r\n" search 0= abort" can't find headers!" \ skip headers s" UTC" extract-time type cr close-socket</lang>
Haskell
Using package HTTP-4000.0.8 from HackageDB <lang Haskell>import Data.List import Network.HTTP (simpleHTTP, getResponseBody, getRequest)
tyd = "http://tycho.usno.navy.mil/cgi-bin/timer.pl"
readUTC = simpleHTTP (getRequest tyd)>>=
fmap ((!!2).head.dropWhile ("UTC"`notElem`).map words.lines). getResponseBody>>=putStrLn</lang>
Usage in GHCi: <lang Haskell>*Main> readUTC 08:30:23</lang>
J
<lang j> require 'web/gethttp'
_8{. ' UTC' taketo gethttp 'http://tycho.usno.navy.mil/cgi-bin/timer.pl'
04:32:44</lang>
The web/gethttp
addon uses Wget on Linux or Windows (J ships with Wget on Windows) and cURL on the Mac. A sockets solution is also possible.
Java
<lang java>import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.URL; import java.net.URLConnection;
/**
 * Prints the current UTC time as reported by the time page.
 */
public class WebTime{
	/**
	 * Downloads the page, looks for the first line containing "UTC" and
	 * prints it without its leading four-character tag.
	 *
	 * @param args unused
	 */
	public static void main(String[] args){
		BufferedReader in = null;
		try{
			URL address = new URL(
					"http://tycho.usno.navy.mil/cgi-bin/timer.pl");
			URLConnection conn = address.openConnection();
			in = new BufferedReader(
					new InputStreamReader(conn.getInputStream()));
			String line;
			//readLine() returns null at end of stream, so test for it
			//before calling contains()
			while((line = in.readLine()) != null){
				if(line.contains("UTC")) break;
			}
			if(line != null){
				System.out.println(line.substring(4));
			}else{
				System.err.println("no UTC time found in the response.");
			}
		}catch(IOException e){
			System.err.println("error connecting to server.");
			e.printStackTrace();
		}finally{
			//always release the connection's stream
			if(in != null){
				try{
					in.close();
				}catch(IOException ignored){
					//nothing useful can be done if closing fails
				}
			}
		}
	}
}</lang>
OCaml
<lang ocaml>let () =
let _,_, page_content = make_request ~url:Sys.argv.(1) ~kind:GET () in
let lines = Str.split (Str.regexp "\n") page_content in let str = List.find (fun line -> try ignore(Str.search_forward (Str.regexp "UTC") line 0); true with Not_found -> false) lines in let str = Str.global_replace (Str.regexp "
") "" str in print_endline str;
- </lang>
There are libraries for this, but it's rather interesting to see how to use a socket to achieve this, so see the implementation of the above function make_request on this page.
Oz
<lang oz>declare
[Regex] = {Module.link ['x-oz://contrib/regex']}
fun {GetPage Url} F = {New Open.file init(url:Url)} Contents = {F read(list:$ size:all)} in {F close} Contents end
fun {GetDateString Doc} case {Regex.search "
([A-Za-z0-9:., ]+ UTC)" Doc} of match(1:S#E ...) then {List.take {List.drop Doc S} E-S+1} end end
Url = "http://tycho.usno.navy.mil/cgi-bin/timer.pl"
in
{System.showInfo {GetDateString {GetPage Url}}}</lang>
Perl
<lang perl>use LWP::Simple;

my $url = 'http://tycho.usno.navy.mil/cgi-bin/timer.pl';
# Fetch the page and capture the text between the <BR> tag and the end of
# " UTC"; nothing is printed when the pattern does not match.
get($url) =~ /<BR>(.+? UTC)/
    and print "$1\n";</lang>
PHP
By iterating over each line:
<lang PHP><?
$contents = file('http://tycho.usno.navy.mil/cgi-bin/timer.pl'); foreach ($contents as $line){
if (($pos = strpos($line, ' UTC')) === false) continue; echo subStr($line, 4, $pos - 4); //Prints something like "Dec. 06, 16:18:03" break;
}</lang>
By regular expressions (
):
<lang PHP><?
// Replace the whole page by the text found between the <BR> tag and " UTC".
// The s modifier lets "." span line breaks so the pattern covers the page.
echo preg_replace(
    "/^.*<BR>(.*) UTC.*$/su",
    "\\1",
    file_get_contents('http://tycho.usno.navy.mil/cgi-bin/timer.pl')
);
</lang>
PicoLisp
<lang PicoLisp>(load "@lib/http.l")
(client "tycho.usno.navy.mil" 80 "cgi-bin/timer.pl"
(when (from "
") (pack (trim (till "U"))) ) )</lang>
Output:
-> "Feb. 19, 18:11:37"
PowerShell
<lang powershell>$wc = New-Object Net.WebClient $html = $wc.DownloadString('http://tycho.usno.navy.mil/cgi-bin/timer.pl') $html -match ', (.*) UTC' | Out-Null Write-Host $Matches[1]</lang>
PureBasic
<lang Purebasic>URLDownloadToFile_( #Null, "http://tycho.usno.navy.mil/cgi-bin/timer.pl", "timer.htm", 0, #Null) ReadFile(0, "timer.htm") While Not Eof(0) : Text$ + ReadString(0) : Wend MessageRequester("Time", Mid(Text$, FindString(Text$, "UTC", 1) - 9 , 8))</lang>
Python
<lang python>import urllib page = urllib.urlopen('http://tycho.usno.navy.mil/cgi-bin/timer.pl') for line in page:
if ' UTC' in line: print line.strip()[4:] break
page.close()</lang> Sample output:
Aug. 12, 15:22:08 UTC Universal Time
R
First, retrieve the web page. See HTTP_Request for more options with this. <lang R>library(RCurl) webpage <- getURL("http://tycho.usno.navy.mil/cgi-bin/timer.pl")</lang> Now parse the html code into a tree and retrieve the interesting bit <lang R>library(XML) pagetree <- htmlTreeParse(webpage ) timesnode <- pagetree$children$html$children$body$children$h3$children$pre$children timesnode <- timesnode[names(timesnode)=="text"]</lang> Finally, find the line with universal time and parse it <lang R>timestrings <- sapply(timesnode, function(x) x$value) index <- grep("Universal Time", timestrings) utctimestr <- strsplit(timestrings[index], "\t")$text[1] utctime <- strptime(utctimestr, "%b. %d, %H:%M:%S UTC")
- Print the date in any format you desire.
strftime(utctime, "%A, %d %B %Y, %H:%M:%S")</lang>
Monday, 03 August 2009, 16:15:37
REBOL
<lang REBOL>REBOL [ Title: "Web Scraping" Author: oofoe Date: 2009-12-07 URL: http://rosettacode.org/wiki/Web_Scraping ]
- Notice that REBOL understands unquoted URL's
service: http://tycho.usno.navy.mil/cgi-bin/timer.pl
- The 'read' function can read from any data scheme that REBOL knows
- about, which includes web URLs. NOTE
- Depending on your security
- settings, REBOL may ask you for permission to contact the service.
html: read service
- I parse the HTML to find the first
(note the unquoted HTML tag - -- REBOL understands those too), then copy the current time from
- there to the "UTC" terminator.
- I have the "to end" in the parse rule so the parse will succeed.
- Not strictly necessary once I've got the time, but good practice.
parse html [thru
copy current thru "UTC" to end]
print ["Current UTC time:" current]</lang>
Ruby
A verbose example for comparison
<lang ruby> require "open-uri"
open('http://tycho.usno.navy.mil/cgi-bin/timer.pl') do |p|
p.each_line do |line| if line =~ /UTC/ puts line.match(/ (\d{1,2}:\d{1,2}:\d{1,2}) /) break end end
end </lang>
A more concise example
<lang ruby> require 'open-uri' puts URI.parse('http://tycho.usno.navy.mil/cgi-bin/timer.pl').read.match(/ (\d{1,2}:\d{1,2}:\d{1,2}) UTC/)[1] </lang>
Tcl
<lang tcl>package require http

# Fetch the page, then capture the text between the <BR> tag and the end of
# " UTC" on a single line (-line keeps "." from crossing line ends).
set request [http::geturl "http://tycho.usno.navy.mil/cgi-bin/timer.pl"]
if {[regexp -line {<BR>(.* UTC)} [http::data $request] --> utc]} {
    puts $utc
}
# Release the state kept for this request token.
http::cleanup $request</lang>
UNIX Shell
This solution uses curl, which can be downloaded for free, and the popular (at least in the Unix world) utility programs grep and sed.
<lang bash>#!/bin/sh
# Fetch the page quietly, keep the line that carries the UTC time, then strip
# the leading <BR> tag and everything from " UTC" onwards.
curl -s http://tycho.usno.navy.mil/cgi-bin/timer.pl |
   grep ' UTC' |
   sed -e 's/^<BR>//;s/ UTC.*$//'</lang>
Ursala
This works by launching the wget command in a separate process and capturing its output. The program is compiled to an executable command. <lang Ursala>#import std
- import cli
- executable ('parameterized',)
whatime =
<.file$[contents: --<>]>+ -+
@hm skip/*4+ ~=(9%cOi&)-~l*+ *~ ~&K3/'UTC', (ask bash)/0+ -[wget -O - http://tycho.usno.navy.mil/cgi-bin/timer.pl]-!+-</lang>
Here is a bash session.
$ whatime Jun. 26, 20:49:52 UTC
Visual Basic .NET
New, .NET way with StringReader: <lang vbnet>Imports System.Net Imports System.IO
Dim client As WebClient = New WebClient() Dim content As String = client.DownloadString("http://tycho.usno.navy.mil/cgi-bin/timer.pl") Dim sr As New StringReader(content) While sr.peek <> -1 Dim s As String = sr.ReadLine If s.Contains("UTC") Then Dim time As String() = s.Substring(4).Split(vbTab) Console.WriteLine(time(0)) End If End While</lang>
Alternative, old fashioned way using VB "Split" function: <lang vbnet>Imports System.Net
Dim client As WebClient = New WebClient() Dim content As String = client.DownloadString("http://tycho.usno.navy.mil/cgi-bin/timer.pl") Dim lines() As String = Split(content, vbLf) 'may need vbCrLf For Each line In lines If line.Contains("UTC") Then Dim time As String() = line.Substring(4).Split(vbTab) Console.WriteLine(time(0)) End If Next</lang>
- Programming Tasks
- Networking and Web Interaction
- AutoHotkey
- AWK
- C
- Libcurl
- C++
- C sharp
- Clojure
- Common Lisp
- Cl-ppcre
- DRAKMA
- D
- E
- Erlang
- F Sharp
- Factor
- Forth
- Haskell
- J
- Java
- OCaml
- Oz
- Perl
- PHP
- PicoLisp
- PowerShell
- PureBasic
- Python
- R
- RCurl
- XML
- REBOL
- Ruby
- Tcl
- UNIX Shell
- Ursala
- TI-83 BASIC/Omit
- TI-89 BASIC/Omit
- Visual Basic .NET
- Input Output
- Batch File/Omit
- M4/Omit