Web scraping: Difference between revisions

Content added Content deleted

Inline

Revision as of 01:27, 27 November 2010

Create a program that downloads the time from this URL: http://tycho.usno.navy.mil/cgi-bin/timer.pl and then prints the current UTC time by extracting just the UTC time from the web page's HTML.

If possible, only use libraries that come at no extra monetary cost with the programming language and that are widely available and popular such as CPAN for Perl or Boost for C++.

AutoHotkey

<lang AutoHotkey>; Download the page to a local file, then show the 8 characters (hh:mm:ss)
; that sit immediately before the first occurrence of "UTC".
UrlDownloadToFile, http://tycho.usno.navy.mil/cgi-bin/timer.pl, time.html
FileRead, timefile, time.html
pos := InStr(timefile, "UTC")
msgbox % time := SubStr(timefile, pos - 9, 8)</lang>

AWK

This is inspired by GETURL example in the manual for gawk.

<lang awk>#! /usr/bin/awk -f

# Fetch the page through gawk's TCP special file and print every line that
# contains "UTC", without its leading <BR> tag.
BEGIN {
  purl = "/inet/tcp/0/tycho.usno.navy.mil/80"
  # print appends ORS, so the request line is followed by a blank line;
  # RS makes the first record read back the whole HTTP header block.
  ORS = RS = "\r\n\r\n"
  print "GET /cgi-bin/timer.pl HTTP/1.0" |& purl
  purl |& getline header
  while ( (purl |& getline ) > 0 ) {
    split($0, a, "\n")
    for(i=1; i <= length(a); i++) {
      if ( a[i] ~ /UTC/ ) {
        sub(/^<BR>/, "", a[i])
        printf "%s\n", a[i]
      }
    }
  }
  close(purl)
}</lang>

ALGOL 68

Works with: ALGOL 68 version Revision 1 - however grep in string, http content and str error are from a non-standard library

Works with: ALGOL 68G version Any - tested with release 1.18.0-9h.tiny

<lang algol68>STRING

  domain="tycho.usno.navy.mil",
  page="cgi-bin/timer.pl";

STRING # search for the needle in the haystack #

  needle = "UTC",
  hay stack = "http://"+domain+"/"+page,

  re success="^HTTP/[0-9.]* 200",
  re result description="^HTTP/[0-9.]* [0-9]+ [a-zA-Z ]*",
  re doctype ="\s\s<![Dd][Oo][Cc][Tt][Yy][Pp][Ee] [^>]+>\s+";

PROC raise error = (STRING msg)VOID: ( put(stand error, (msg, new line)); stop);

PROC is html page = (REF STRING page) BOOL: (

    BOOL out=grep in string(re success, page, NIL, NIL) = 0;
    IF INT start, end;
       grep in string(re result description, page, start, end) = 0
    THEN
       page:=page[end+1:];
       IF grep in string(re doctype, page, start, end) = 0
       THEN page:=page[start+2:]
       ELSE raise error("unknown format retrieving page")
       FI
    ELSE raise error("unknown error retrieving page")
    FI;
    out

);

STRING reply; INT rc = http content (reply, domain, haystack, 0); IF rc = 0 AND is html page (reply) THEN

 STRING line; FILE freply; associate(freply, reply);
 on logical file end(freply, (REF FILE freply)BOOL: (done; SKIP));
 DO
   get(freply,(line, new line));
   IF string in string(needle, NIL, line) THEN print((line, new line)) FI
 OD;
 done: SKIP

ELSE raise error (strerror (rc)) FI</lang> Sample output:

<BR>Sep. 26, 21:51:17 UTC               Universal Time

C

Works with: POSIX version .1-2001

Library: libcurl

Error handling is kept to a minimum.

<lang c>#include <stdio.h>

#include <string.h>
#include <curl/curl.h>
#include <sys/types.h>
#include <regex.h>

#define BUFSIZE 16384

/* Number of bytes stored in the receive buffer so far. */
size_t lr = 0;

/*
 * libcurl write callback: appends one received chunk to the caller's buffer.
 *
 * ptr    - the received data (size*nmemb bytes)
 * stream - the destination buffer (the CURLOPT_WRITEDATA pointer), which
 *          must be at least BUFSIZE bytes long
 *
 * Returns the number of bytes consumed. Returns 0 (which differs from the
 * chunk size and therefore makes libcurl abort the transfer) when the chunk
 * does not fit.
 */
size_t filterit(void *ptr, size_t size, size_t nmemb, void *stream) {

 size_t chunk = size*nmemb;

 /* Keep one byte free: the caller terminates the data with buffer[lr] = 0. */
 if ( chunk >= BUFSIZE - lr ) return 0;
 /* void* arithmetic is a compiler extension, so step through a char*. */
 memcpy((char *)stream + lr, ptr, chunk);
 lr += chunk;
 return chunk;

}

int main() {

 CURL *curlHandle;
 char buffer[BUFSIZE];
 regmatch_t amatch;
 regex_t cregex;

 curlHandle = curl_easy_init();
 curl_easy_setopt(curlHandle, CURLOPT_URL, "http://tycho.usno.navy.mil/cgi-bin/timer.pl");
 curl_easy_setopt(curlHandle, CURLOPT_FOLLOWLOCATION, 1);
 curl_easy_setopt(curlHandle, CURLOPT_WRITEFUNCTION, filterit);
 curl_easy_setopt(curlHandle, CURLOPT_WRITEDATA, buffer);
 int success = curl_easy_perform(curlHandle);
 curl_easy_cleanup(curlHandle);

 buffer[lr] = 0;
 
 regcomp(&cregex, " UTC", REG_NEWLINE);
 regexec(&cregex, buffer, 1, &amatch, 0);
 int bi = amatch.rm_so;
 while ( bi-- > 0 )
   if ( memcmp(&buffer[bi], "
", 4) == 0 ) break;

 buffer[amatch.rm_eo] = 0;

 printf("%s\n", &buffer[bi+4]);

 regfree(&cregex);
 return 0;

}</lang>

C++

Library: Boost
to be compiled under Linux with g++ -lboost_regex -lboost_system -lboost_thread <lang cpp>#include <iostream>

#include <istream>
#include <ostream>
#include <string>
#include <boost/asio.hpp>
#include <boost/regex.hpp>
#include <sstream>

int main( ) {

  try {
     boost::asio::io_service io_service ;
     boost::asio::ip::tcp::resolver resolver ( io_service ) ;
     //we now try to get a list of endpoints to the server 
     boost::asio::ip::tcp::resolver::query query( "tycho.usno.navy.mil" ,

"http" ) ;

     boost::asio::ip::tcp::resolver::iterator endpoint_iterator =

resolver.resolve( query ) ;

     boost::asio::ip::tcp::resolver::iterator end ;
     //looking for a successful endpoint connection
     boost::asio::ip::tcp::socket socket ( io_service ) ;
     boost::system::error_code error =

boost::asio::error::host_not_found ;

     while ( error && endpoint_iterator != end ) {

socket.close( ) ; socket.connect ( *endpoint_iterator++ , error ) ;

     }
     if ( error )

throw boost::system::system_error ( error ) ;

     boost::asio::streambuf request ; //we now write the request
     std::ostream request_stream( &request ) ;
     request_stream << "GET /cgi-bin/timer.pl HTTP/1.0\r\n"  ;
     request_stream << "Host: " << "tycho.usno.navy.mil" << "\r\n" ;
     request_stream << "Accept: */*\r\n" ; 
     request_stream << "Connection: close\r\n\r\n" ;
     boost::asio::write ( socket , request ) ;  //we write the request
     boost::asio::streambuf response ;
     boost::asio::read_until( socket , response , "\r\n\r\n" ) ;
     std::ostringstream line ;
     //reading to end , disregarding possible error messages
     boost::regex e ( "
(.+\\s+UTC)" ) ;
     while ( boost::asio::read( socket , response,

boost::asio::transfer_at_least( 1 ) , error )) { line << &response ; boost::smatch matches ; if ( boost::regex_search( line.str( ) , matches, e ) ) std::cout << matches[ 1 ] << std::endl ; line.str( "" ) ;

     }
     if ( error != boost::asio::error::eof )

throw boost::system::system_error( error ) ;

     } catch ( std::exception & ex ) {

std::cout << "Exception: " << ex.what( ) << "\n" ;

     }
  return 0 ;

}</lang>

C#

<lang csharp>using System;
using System.IO;
using System.Net;

/// <summary>
/// Downloads the time page and prints the first line containing "UTC",
/// without its first four characters (the leading tag).
/// </summary>
class Program
   {
       static void Main(string[] args)
       {
           string utcLine = null;
           using (WebClient wc = new WebClient())
           using (Stream myStream = wc.OpenRead("http://tycho.usno.navy.mil/cgi-bin/timer.pl"))
           using (StreamReader sr = new StreamReader(myStream))
           {
               // ReadLine() returns null only at the real end of the stream;
               // Peek() may report -1 while network data is still in flight.
               string html;
               while ((html = sr.ReadLine()) != null)
               {
                   if (html.Contains("UTC"))
                   {
                       utcLine = html;
                       break;
                   }
               }
           }

           if (utcLine == null)
           {
               Console.WriteLine("No UTC time found in the page.");
           }
           else
           {
               // Remove(0, 4) throws on strings shorter than four characters.
               Console.WriteLine(utcLine.Length >= 4 ? utcLine.Remove(0, 4) : utcLine);
           }

           Console.ReadLine();
       }
   }

</lang>

Clojure

Clojure 1.2:

<lang clojure> (second (re-find #" (\d{1,2}:\d{1,2}:\d{1,2}) UTC" (slurp "http://tycho.usno.navy.mil/cgi-bin/timer.pl"))) </lang>

Common Lisp

Library: cl-ppcre

Library: DRAKMA

<lang lisp>BOA> (let* ((url "http://tycho.usno.navy.mil/cgi-bin/timer.pl")

           (regexp (load-time-value
                    (cl-ppcre:create-scanner "(?m)^.{4}(.+? UTC)")))
           (data (drakma:http-request url)))
      (multiple-value-bind (start end start-regs end-regs)
          (cl-ppcre:scan regexp data)
        (declare (ignore end))
        (when start
          (subseq data (aref start-regs 0) (aref end-regs 0)))))

"Aug. 12, 04:29:51 UTC"</lang>

D

<lang d>import std.stdio, std.string, std.socket;

/**
 * Fetches a resource over plain HTTP/1.0 on port 80 and returns the raw
 * reply (status line, headers and body).
 *
 * Params:
 *   url  = despite its name, the server's host name, e.g. "tycho.usno.navy.mil"
 *   host = despite its name, the absolute path of the resource, e.g. "/cgi-bin/timer.pl"
 */
string loadPage(string url, string host) {

   InternetAddress addr = new InternetAddress(url, 80);
   TcpSocket sock = new TcpSocket(addr);

   // Ordinary literals are required here: in an r"..." literal "\r\n" is
   // four literal characters, not a CR LF line terminator.
   sock.send("GET " ~ host ~ " HTTP/1.0\r\n");
   sock.send("Host: " ~ url ~ "\r\n");   // the Host header carries the host name
   sock.send("Connection: close\r\n");
   sock.send("\r\n");

   char[1024] buffer;
   string msg;
   // A short read does not mean the reply is complete; keep reading until
   // receive() reports a closed connection (0) or an error (negative).
   while (true) {
       auto got = sock.receive(buffer);
       if (got <= 0)
           break;
       msg ~= buffer[0 .. got];
   }

   sock.shutdown(SocketShutdown.BOTH);
   sock.close();
   return msg;

}

/// Downloads the time page and prints every line containing " UTC",
/// dropping the first four characters of each such line.
void main() {

   string response = loadPage(r"tycho.usno.navy.mil", r"/cgi-bin/timer.pl");
   foreach (row; response.splitlines()) {
       // too short to hold anything after the four skipped characters
       if (row.length <= 4)
           continue;
       if (indexOf(row, " UTC") == -1)
           continue;
       writeln(row[4 .. $]);
   }

}</lang>

E

<lang e>interp.waitAtTop(when (def html := <http://tycho.usno.navy.mil/cgi-bin/timer.pl>.getText()) -> {

   def rx`(?s).*>(@time.*? UTC).*` := html
   println(time)

})</lang>

Erlang

Using regular expressions: <lang erlang>-module(scraping).
-export([main/0]).
-define(Url, "http://tycho.usno.navy.mil/cgi-bin/timer.pl").
-define(Match, "<BR>(.+ UTC)").

%% Fetches the page and prints the text between a <BR> tag and the end of " UTC".
main() ->
    inets:start(),
    {ok, {_Status, _Header, HTML}} = http:request(?Url),
    {match, [Time]} = re:run(HTML, ?Match, [{capture, all_but_first, binary}]),
    io:format("~s~n",[Time]).</lang>

F#

This code is asynchronous - it will not block any threads while it waits on a response from the remote server. <lang fsharp>open System
open System.Net
open System.Text.RegularExpressions

// Download the page asynchronously and print the text between a <BR> tag
// and the end of " UTC".
async {
   use wc = new WebClient()
   let! html = wc.AsyncDownloadString(Uri("http://tycho.usno.navy.mil/cgi-bin/timer.pl"))
   return Regex.Match(html, @"<BR>(.+ UTC)").Groups.[1].Value
} |> Async.RunSynchronously |> printfn "%s" </lang>

Factor

<lang factor>USING: http.get io sequences ;

"http://tycho.usno.navy.mil/cgi-bin/timer.pl" http-get nip [ "UTC" swap start [ 9 - ] [ 1 - ] bi ] keep subseq print</lang>

Forth

Works with: GNU Forth version 0.7.0

<lang forth>include unix/socket.fs

\ Find the string "type len" inside "addr len" and return the text that runs
\ from just after the preceding '>' up to and including the found string.
: extract-time ( addr len type len -- time len )
  dup >r
  search 0= abort" that time not present!"
  dup >r
  begin -1 /string  over 1- c@ [char] > = until       \ seek back to <BR> at start of line
  r> - r> + ;

s" tycho.usno.navy.mil" 80 open-socket
dup s\" GET /cgi-bin/timer.pl HTTP/1.0\n\n" rot write-socket
dup pad 4096 read-socket
s\" \r\n\r\n" search 0= abort" can't find headers!"    \ skip headers
s" UTC" extract-time type cr
close-socket</lang>

Haskell

Using package HTTP-4000.0.8 from HackageDB <lang Haskell>import Data.List import Network.HTTP (simpleHTTP, getResponseBody, getRequest)

tyd = "http://tycho.usno.navy.mil/cgi-bin/timer.pl"

readUTC = simpleHTTP (getRequest tyd)>>=

           fmap ((!!2).head.dropWhile ("UTC"`notElem`).map words.lines). getResponseBody>>=putStrLn</lang>

Usage in GHCi: <lang Haskell>*Main> readUTC 08:30:23</lang>

Icon and Unicon

Icon

Icon has capability to read web pages using the external function cfunc. The Unicon messaging extensions are more succinct.

Unicon

<lang Unicon>procedure main()
m := open(url := "http://tycho.usno.navy.mil/cgi-bin/timer.pl","m") | stop("Unable to open ",url)
every (p := "") ||:= |read(m)   # read the page into a single string
close(m)

# map(p) is the lower-cased page, so the tag and " utc" are matched in lower case
map(p) ? ( tab(find("<br>")), ="<br>", write("UTC time=",p[&pos:find(" utc")]))  # scrape and show
end</lang>

J

<lang j> require 'web/gethttp'

  _8{. ' UTC' taketo gethttp 'http://tycho.usno.navy.mil/cgi-bin/timer.pl'

04:32:44</lang>

The web/gethttp addon uses Wget on Linux or Windows (J ships with Wget on Windows) and cURL on the Mac.

(A sockets solution is also possible. But, while basic HTTP support is trivial to implement, a fully standards-compliant implementation can involve a lot of code to deal with rare corner cases, and the time required to complete a web request is often significantly longer than the time to invoke an external program. This would imply a fair bit of maintenance and testing overhead to deal with issues which rarely matter, if a direct sockets implementation were used.)

Java

<lang java>import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.URL; import java.net.URLConnection;

/**
 * Downloads the time page and prints the first line containing "UTC",
 * without its first four characters (the leading tag).
 */
public class WebTime{
    public static void main(String[] args){
        BufferedReader in = null;
        try{
            URL address = new URL(
                    "http://tycho.usno.navy.mil/cgi-bin/timer.pl");
            URLConnection conn = address.openConnection();
            in = new BufferedReader(
                    new InputStreamReader(conn.getInputStream()));
            String line;
            // readLine() returns null at end of stream; stop there instead of
            // dereferencing null when no line mentions UTC.
            while((line = in.readLine()) != null && !line.contains("UTC"));
            if(line == null){
                System.err.println("no UTC time found in the page.");
            }else{
                System.out.println(line.substring(4));
            }
        }catch(IOException e){
            System.err.println("error connecting to server.");
            e.printStackTrace();
        }finally{
            if(in != null){
                try{
                    in.close();
                }catch(IOException ignored){
                    // nothing useful can be done if closing fails
                }
            }
        }
    }
}</lang>

Mathematica

<lang mathematica> test = StringSplit[Import["http://tycho.usno.navy.mil/cgi-bin/timer.pl"], "\n"]; Extract[test, Flatten@Position[StringFreeQ[test, "UTC"], False]] </lang>

Objeck

<lang objeck> use Net; use IO; use Structure;

bundle Default {

 class Scrape {
   function : Main(args : String[]) ~ Nil {
     client := HttpClient->New("http://tycho.usno.navy.mil/cgi-bin/timer.pl", 80);
     lines := client->Get();
     
      i := 0; 
      found := false;
     while(found <> true & i < lines->Size()) {
        line := lines->Get(i)->As(String);
       index := line->Find("UTC");
       if(index > -1) {
         time := line->SubString(index - 9, 9)->Trim();
         time->PrintLine();
         found := true;
       };
       i += 1;
     };
   }
 }

} </lang>

OCaml

<lang ocaml>(* Fetch the page named on the command line, pick the first line that
   contains "UTC" and print it with every "<BR>" tag removed. *)
let () =

 let _,_, page_content = make_request ~url:Sys.argv.(1) ~kind:GET () in

 let lines = Str.split (Str.regexp "\n") page_content in
 let str =
   List.find
     (fun line ->
       try ignore(Str.search_forward (Str.regexp "UTC") line 0); true
       with Not_found -> false)
     lines
 in
 let str = Str.global_replace (Str.regexp "<BR>") "" str in
 print_endline str;

</lang>

There are libraries for this, but it's rather interesting to see how to use a socket to achieve this, so see the implementation of the above function make_request on this page.

Oz

<lang oz>declare

 [Regex] = {Module.link ['x-oz://contrib/regex']}

 fun {GetPage Url}
    F = {New Open.file init(url:Url)}
    Contents = {F read(list:$ size:all)}
 in
    {F close}
    Contents
 end

 %% Returns the part of Doc captured by group 1: the text between a <BR> tag
 %% and the end of " UTC".
 fun {GetDateString Doc}
    case {Regex.search "<BR>([A-Za-z0-9:., ]+ UTC)" Doc}
    of match(1:S#E ...) then {List.take {List.drop Doc S} E-S+1}
    end
 end

 Url = "http://tycho.usno.navy.mil/cgi-bin/timer.pl"

in

 {System.showInfo {GetDateString {GetPage Url}}}</lang>

Perl

Library: LWP

<lang perl>use LWP::Simple;

# Print the text between a <BR> tag and the end of the first " UTC".
my $url = 'http://tycho.usno.navy.mil/cgi-bin/timer.pl';
get($url) =~ /<BR>(.+? UTC)/
    and print "$1\n";</lang>

Perl 6

<lang perl6>use HTTP::Client; # http://github.com/carlins/http-client/
my $site = "http://tycho.usno.navy.mil/cgi-bin/timer.pl";
HTTP::Client.new.get($site).match(/'<BR>'( .+? <ws> UTC )/)[0].say</lang>

Note that the string between '<' and '>' refers to regex tokens, so to match a literal '<BR>' you need to quote it, while <ws> refers to the built-in token whitespace. Also, whitespace is ignored by default in Perl 6 regexes.

PHP

By iterating over each line:

<lang PHP><?

$contents = file('http://tycho.usno.navy.mil/cgi-bin/timer.pl'); foreach ($contents as $line){

 if (($pos = strpos($line, ' UTC')) === false) continue;
 echo subStr($line, 4, $pos - 4); //Prints something like "Dec. 06, 16:18:03"
 break;

}</lang>

By regular expressions (

Works with: PHP version 4.3.0

):

<lang PHP><?

// Reduce the whole page to the text between a <BR> tag and " UTC".
echo preg_replace(
 "/^.*<BR>(.*) UTC.*$/su",
 "\\1",
 file_get_contents('http://tycho.usno.navy.mil/cgi-bin/timer.pl')
); </lang>

PicoLisp

<lang PicoLisp>(load "@lib/http.l")

# Skip input up to the first "<BR>", then take everything before the next "U"
(client "tycho.usno.navy.mil" 80 "cgi-bin/timer.pl"
   (when (from "<BR>")
      (pack (trim (till "U"))) ) )</lang>

Output:

-> "Feb. 19, 18:11:37"

PowerShell

<lang powershell># Download the page and print the text between ", " and " UTC"
$wc = New-Object Net.WebClient
$html = $wc.DownloadString('http://tycho.usno.navy.mil/cgi-bin/timer.pl')
$html -match ', (.*) UTC' | Out-Null
Write-Host $Matches[1]</lang>

Protium

English dialect, short form, using integrated Rexx pattern matcher: <lang html><@ DEFAREPRS>Rexx Parse</@> <@ DEFPRSLIT>Rexx Parse|'
' UTCtime 'UTC'</@> <@ LETVARURL>timer|http://tycho.usno.navy.mil/cgi-bin/timer.pl</@> <@ ACTRPNPRSVAR>Rexx Parse|timer</@> <@ SAYVAR>UTCtime</@></lang>

English dialect, padded variable-length form: <lang html><# DEFINE WORKAREA PARSEVALUES>Rexx Parse</#> <# DEFINE PARSEVALUES LITERAL>Rexx Parse|'
' UTCtime 'UTC'</#> <# LET VARIABLE URLSOURCE>timer|http://tycho.usno.navy.mil/cgi-bin/timer.pl</#> <# ACT REPLACEBYPATTERN PARSEVALUES VARIABLE>Rexx Parse|timer</#> <# SAY VARIABLE>UTCtime</#></lang>

PureBasic

<lang Purebasic>URLDownloadToFile_( #Null, "http://tycho.usno.navy.mil/cgi-bin/timer.pl", "timer.htm", 0, #Null) ReadFile(0, "timer.htm") While Not Eof(0) : Text$ + ReadString(0) : Wend MessageRequester("Time", Mid(Text$, FindString(Text$, "UTC", 1) - 9 , 8))</lang>

Python

<lang python>import urllib page = urllib.urlopen('http://tycho.usno.navy.mil/cgi-bin/timer.pl') for line in page:

   if ' UTC' in line:
       print line.strip()[4:]
       break

page.close()</lang> Sample output:

Aug. 12, 15:22:08 UTC           Universal Time

R

Library: RCurl

Library: XML

First, retrieve the web page. See HTTP_Request for more options with this. <lang R>library(RCurl) webpage <- getURL("http://tycho.usno.navy.mil/cgi-bin/timer.pl")</lang> Now parse the html code into a tree and retrieve the interesting bit <lang R>library(XML) pagetree <- htmlTreeParse(webpage ) timesnode <- pagetree$children$html$children$body$children$h3$children$pre$children timesnode <- timesnode[names(timesnode)=="text"]</lang> Finally, find the line with universal time and parse it <lang R>timestrings <- sapply(timesnode, function(x) x$value) index <- grep("Universal Time", timestrings) utctimestr <- strsplit(timestrings[index], "\t")$text[1] utctime <- strptime(utctimestr, "%b. %d, %H:%M:%S UTC")

Print the date in any format you desire.

strftime(utctime, "%A, %d %B %Y, %H:%M:%S")</lang>

Monday, 03 August 2009, 16:15:37

Solution with base R.

<lang R>line = grep("UTC", readLines("http://tycho.usno.navy.mil/cgi-bin/timer.pl"), value = TRUE) sub(".*(\\d{2}:\\d{2}:\\d{2}).*", "\\1", line)</lang>

REBOL

<lang REBOL>REBOL [
    Title: "Web Scraping"
    Author: oofoe
    Date: 2009-12-07
    URL: http://rosettacode.org/wiki/Web_Scraping
]

; Notice that REBOL understands unquoted URLs:

service: http://tycho.usno.navy.mil/cgi-bin/timer.pl

; The 'read' function can read from any data scheme that REBOL knows
; about, which includes web URLs. NOTE: Depending on your security
; settings, REBOL may ask you for permission to contact the service.

html: read service

; I parse the HTML to find the first <br> (note the unquoted HTML tag
; -- REBOL understands those too), then copy the current time from
; there to the "UTC" terminator.

; I have the "to end" in the parse rule so the parse will succeed.
; Not strictly necessary once I've got the time, but good practice.

parse html [thru <br> copy current thru "UTC" to end]

print ["Current UTC time:" current]</lang>

Ruby

A verbose example for comparison

<lang ruby>require "open-uri"

open('http://tycho.usno.navy.mil/cgi-bin/timer.pl') do |p|

 p.each_line do |line|
   if line =~ /UTC/
     puts line.match(/ (\d{1,2}:\d{1,2}:\d{1,2}) /)
     break
   end
 end

end </lang>

A more concise example

<lang ruby>require 'open-uri' puts URI.parse('http://tycho.usno.navy.mil/cgi-bin/timer.pl').read.match(/ (\d{1,2}:\d{1,2}:\d{1,2}) UTC/)[1] </lang>

Tcl

<lang tcl>package require http

# Print the text between a <BR> tag and the end of " UTC" on the same line.
set request [http::geturl "http://tycho.usno.navy.mil/cgi-bin/timer.pl"]
if {[regexp -line {<BR>(.* UTC)} [http::data $request] --> utc]} {
   puts $utc
}</lang>

UNIX Shell

This solution uses curl, which can be downloaded for free, and the popular (at least in the Unix world) utility programs grep and sed.

<lang bash>#!/bin/sh
# Keep the line that mentions UTC, then strip its leading <BR> tag and
# everything from " UTC" onwards.
curl -s http://tycho.usno.navy.mil/cgi-bin/timer.pl |
   grep ' UTC' |
   sed -e 's/^<BR>//;s/ UTC.*$//'</lang>

Ursala

This works by launching the wget command in a separate process and capturing its output. The program is compiled to an executable command. <lang Ursala>#import std

import cli

executable ('parameterized',)

whatime =

<.file$[contents: --<>]>+ -+

  @hm skip/*4+ ~=(9%cOi&)-~l*+ *~ ~&K3/'UTC',
  (ask bash)/0+ -[wget -O - http://tycho.usno.navy.mil/cgi-bin/timer.pl]-!+-</lang>

Here is a bash session.

$ whatime
Jun. 26, 20:49:52 UTC

Visual Basic .NET

New, .NET way with StringReader: <lang vbnet>Imports System.Net Imports System.IO

       Dim client As WebClient = New WebClient()
       Dim content As String = client.DownloadString("http://tycho.usno.navy.mil/cgi-bin/timer.pl")
       Dim sr As New StringReader(content)
       While sr.peek <> -1
           Dim s As String = sr.ReadLine
           If s.Contains("UTC") Then
               Dim time As String() = s.Substring(4).Split(vbTab)
               Console.WriteLine(time(0))
           End If
       End While</lang>

Alternative, old fashioned way using VB "Split" function: <lang vbnet>Imports System.Net

       Dim client As WebClient = New WebClient()
       Dim content As String = client.DownloadString("http://tycho.usno.navy.mil/cgi-bin/timer.pl")
       Dim lines() As String = Split(content, vbLf) 'may need vbCrLf 
       For Each line In lines
           If line.Contains("UTC") Then
               Dim time As String() = line.Substring(4).Split(vbTab)
               Console.WriteLine(time(0))
           End If
       Next</lang>