Yahoo! search interface
Task
Yahoo! search interface
You are encouraged to solve this task according to the task description, using any language you may know.

Create a class for searching Yahoo! results. It must implement a Next Page method, and read URL, Title and Content from results.
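
All of the solutions below work by scraping Yahoo!'s HTML result pages. As a rough orientation, the interface the task asks for boils down to something like the following Python sketch (illustrative only, not one of the solutions below; the real work is fetching and parsing each result page):

<lang python>class YahooSearch:
    """Sketch of the interface the task asks for (names illustrative)."""
    def __init__(self, query, page=1):
        self.query = query
        self.page = page
        self.results = []          # would hold (url, title, content) records

    def next_page(self):
        """Return a new search object positioned on the following page."""
        return YahooSearch(self.query, self.page + 1)</lang>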

AutoHotkey

Translation of: Python

<lang AutoHotkey>test:
yahooSearch("test", 1)
yahooSearch("test", 2)
return

yahooSearch(query, page) {

 global
 start := ((page - 1) * 10) + 1
 filedelete, search.txt
 urldownloadtofile, % "http://search.yahoo.com/search?p=" . query
 . "&b=" . start, search.txt
 fileread, content, search.txt

 ; The HTML literals in this pattern were stripped by the wiki renderer;
 ; reconstructed here from the parallel examples on this page.
 reg = <a class="yschttl spt" href=".+?" >(.+?)</a></h3></div><div class="abstr">(.+?)</div><span class=url>(.+?)</span>

 index := found := 1
 while (found := regexmatch(content, reg, self, found + 1))
 {
   msgbox % title%A_Index% := fix(self1)
   content%A_Index% := fix(self2)
   url%A_Index% := fix(self3)
 }

}

fix(url) {
  if pos := instr(url, "</a>")
    StringLeft, url, url, pos - 1
  url := regexreplace(url, "<.*?>")
  return url
}</lang>

C#

<lang csharp>using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using System.Collections.Generic;
using System.Linq;

class YahooSearch {

   private string query;
   private string content;
   private int page = 1;
   public YahooSearch(string query) {
       this.query = query;
       this.content = new WebClient().DownloadString("http://search.yahoo.com/search?p=" + query);
   }
   public YahooSearch(string query, int page) {
       this.query = query;
       this.page = page;
       this.content = new WebClient().DownloadString(String.Format("http://search.yahoo.com/search?p={0}&b={1}", query, ((this.page - 1) * 10) + 1));
   }
    string Fix(string x) {
        // Strip formatting tags. The original tag literals were eaten by the
        // wiki renderer; <b>, </b>, <wbr>, <wbr /> and <b>...</b> below are
        // reconstructions based on the parallel examples on this page.
        x = x.Replace("<b>", "").Replace("</b>", "").Replace("<wbr>", "").Replace("<wbr />", "").Replace("<b>...</b>", "");

        int i = x.IndexOf("</a>");

        if (i > 0) return x.Substring(0, i);
        else return x;
    }
   public YahooResult[] Results {
       get {
           ArrayList results = new ArrayList();

            // The HTML literals in this pattern were eaten by the wiki
            // renderer; this is an approximate reconstruction based on the
            // other examples on this page (group 1 = title, group 2 =
            // abstract, group 3 = url).
            foreach (Match e in new Regex(
                "<a class=\"yschttl spt\" href=\".+?\" >(.+?)</a></h3>" +
                "(?:<div[^>]*>.+?</div>)?" +
                "</div><div class=\"abstr\">(.+?)</div>" +
                "<span class=url>(.+?)</span>").Matches(this.content)) {

               string rurl = Fix(e.Groups[3].Value);
               string rtitle = Fix(e.Groups[1].Value);
               string rcontent = Fix(e.Groups[2].Value);
               
               results.Add(new YahooResult(rurl, rtitle, rcontent));
           }
           return (YahooResult[])results.ToArray(typeof(YahooResult));
       }
   }
   public YahooSearch NextPage() {
       return new YahooSearch(this.query, this.page + 1);
   }
   public YahooSearch GetPage(int page) {
       return new YahooSearch(this.query, page);
   }

}

class YahooResult {

   public string URL { get; set; }
   public string Title { get; set; }
   public string Content { get; set; }
   public YahooResult(string url, string title, string content) {
       this.URL = url;
       this.Title = title;
       this.Content = content;
   }

}

// Usage:

class Prog {

   static void Main() {
       YahooSearch x = new YahooSearch("test");
       foreach (YahooResult result in x.Results) {
           Console.WriteLine(result.Title);
       }
   }

}</lang>

Oz

Library: OzHttpClient

Instead of a class, the implementation defines a function that returns a lazy list of result pages. This also makes it possible to request, say, the first and the third page without wasting any resources on an unneeded second page.
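
For comparison, Oz's by-need values behave like memoized thunks; a rough Python analogy (illustrative only, not part of the Oz solution) shows why pages 1 and 3 can be forced without ever fetching page 2:

<lang python>from functools import lru_cache

@lru_cache(maxsize=None)            # memoize: each page is fetched at most once
def page(n):
    print("fetching page", n)       # the HTTP request would go here
    return ["result %d-%d" % (n, i) for i in range(1, 11)]

first, third = page(1), page(3)     # page 2 is never fetched</lang>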

We implement some simple parsing with logic programming. Regular expressions in Oz don't seem to support lazy quantification, which makes parsing the result pages with them difficult. <lang oz>declare

 [HTTPClient] = {Module.link ['x-ozlib://mesaros/net/HTTPClient.ozf']}
 [StringX] = {Module.link ['x-oz://system/String.ozf']}
 [Regex] = {Module.link ['x-oz://contrib/regex']}
 proc {ExampleUsage}    
    Pages = {YahooSearch "Rosetta code"}
 in
    {Inspector.configure widgetShowStrings true}
    %% Make pages 1 and 3 needed, i.e. they are retrieved and displayed
    %% without further user interaction 
    {ForAll {Nth Pages 1} Value.makeNeeded}
    {ForAll {Nth Pages 3} Value.makeNeeded}
    %% Display the infinite list of search result pages.
    {Inspect Pages}
 end
 %% Returns a lazy list of pages.
 %% A page is a list of entries like this: result(url:U title:T content:C).
 fun {YahooSearch Query}
    local
       Client = {CreateClient}
    in
       fun {OpenURL Url Params}
          OutParams
       in
          {Client getService(Url Params ?OutParams ?_)}
          OutParams.sOut
       end
    end

    fun {Page Nr}
       StartResult = (Nr-1)*10+1
       %% only retrieve it when really needed
       Doc = {Value.byNeed
              fun {$}
                 {OpenURL "http://search.yahoo.com/search"
                  ["p"#Query "b"#{Int.toString StartResult}]}
              end}
       RE = "<a class=\"yschttl spt\" href="
    in
       %% We yield the results lazily because we want to be able
       %% to build the pages list structure without necessarily building
       %% the single elements (e.g. retrieve page 1 and 3 but not 2).
       for match(0:_#E ...) in {Regex.allMatches RE Doc} yield:Yield do
          Xs = {List.drop Doc E}
       in
          {Yield {ParseEntry Xs}}
       end
    end
 in
    for PageNr in 1;PageNr+1 yield:Yield do
       {Yield {Page PageNr}}
    end
 end
 fun {CreateClient}
    Client = {New HTTPClient.cgiGET
              init(inPrms(toFile:false toStrm:true)
                   httpReqPrms)}
    {Finalize.register Client proc {$ C} {C closeAll(true)} end}
 in
    Client
 end
  
 fun {ParseEntry Xs}
    proc {Parse Root}
       R1 R2 R3 R4 R5 R6 R7
       Url = {Fix {QuotedString Xs R1}}
       {Const ">" R1 R2}
       Title = {Fix {Until "</a>" R2 R3}}
       %% NOTE: the literal tag strings in the Const/Until calls below were
       %% stripped by the wiki renderer; the values used here are plausible
       %% reconstructions based on the markup the other solutions match.
       {Const "</h3>" R3 R4}
       choice %% "enchanted" result?
          {Const "<div class=\"sm-bd\">" R4 R5}
          {Until "</div>" R5 R6 _}
       [] %% result with links into document
          {Const "</div><div class=\"inlinks\">" R4 R5}
          {Until "</div>" R5 R6 _}
       [] %% PDF file
          {Const "<div class=\"format\">" R4 R5}
          {Until "</a></div>" R5 R6 _}
       [] %% with review
          {Const "<div class=\"rev\">" R4 R5}
          R6 = nil %% no nice abstract when a review is there
       [] %% normal result
          R6 = R4
       end
       Abstract =
       choice
          {Const "</div><div class=\"abstr\">" R6 R7}
          {Fix {Until "</div>" R7 _}}
       [] {Const "</div><div class=\"sm-abs\">" R6 R7}
          {Fix {Until "</div>" R7 _}}
       [] ""
       end
    in
       Root = result(url:Url title:Title content:Abstract)
    end
 in
    {CondSelect {SearchOne Parse} 1 parseError}
 end
 %% Result: contents of Xs until M is found.
 %% Xs = {Append M Yr}
 fun {Until M Xs ?Yr}
    L R
 in
    {List.takeDrop Xs {Length M} L R}
    if L == M then Yr = R nil
    elsecase Xs of X|Xr then X|{Until M Xr Yr}
    [] nil then Yr = nil nil
    end
 end
 %% Assert that Xs starts with C. Return the rest in Ys.
 proc {Const C Xs ?Ys}
    {List.takeDrop Xs {Length C} C Ys}
 end
 %% Assert that a quoted string follows.
 %% Returns the unquoted string and binds Ys to the rest of Xs.
 fun {QuotedString &"|Xs ?Ys}
    fun {Loop Xs Ys}

       case Xs of &\\|&"|Xr then &\\|&"|{Loop Xr Ys}
       [] &"|Xr then Ys = Xr nil
       [] X|Xr then X|{Loop Xr Ys}
       end

    end
 in
    {Loop Xs Ys}
 end
 %% Remove formatting tags. (The tag literals were stripped by the wiki
 %% renderer; <b>, </b>, <wbr>, <wbr /> and <b>...</b> are reconstructions
 %% based on the other examples on this page.)
 fun {Fix Xs}
    {Until "</a>"
     {FoldL ["<b>" "</b>" "<wbr>" "<wbr />" "<b>...</b>"]
      fun {$ Ys Z}
         {StringX.replace Ys Z ""}
      end
      Xs}
     _}
 end

in

 {ExampleUsage}</lang>

Perl

<lang perl>package YahooSearch;

use Encode;
use HTTP::Cookies;
use WWW::Mechanize;

# --- Internals -------------------------------------------------

sub apply (&$)
{my $f = shift; local $_ = shift; $f->(); return $_;}

# We construct a cookie to get 100 results per page and prevent
# "enhanced results".

my $search_prefs = 'v=1&n=100&sm=' .

   apply {s/([^a-zA-Z0-9])/sprintf '%%%02X', ord $1/ge}
   join '|',
   map {'!' . $_}
   qw(hsb Zq0 XbM sss dDO VFM RQh uZ0 Fxe yCl GP4 FZK yNC mEG niH);

my $cookies = HTTP::Cookies->new;
$cookies->set_cookie(0, 'sB', $search_prefs, '/', 'search.yahoo.com');

my $mech = new WWW::Mechanize

  (cookie_jar => $cookies,
   stack_depth => 0);

sub read_page

{my ($next, $page, @results) =
    ($mech->find_link(text => 'Next >')->url,
     decode 'iso-8859-1', $mech->content);
 # The tag literals around the abstract were stripped by the wiki renderer;
 # the <div class=abstr> pair below is a reconstruction.
 while ($page =~ m
    {<a \s class="yschttl \s spt" \s href=" ([^"]+) " \s* >  #"
     (.+?) </a> .+?
     <div \s class=abstr> (.+?) </div>}xg)
    {push @results, {url => $1, title => $2, content => $3};
     foreach ( @{$results[-1]}{qw(title content)} )
        {s/<.+?>//g;
         $_ = encode 'utf8', $_;}}
 return $next, \@results;}
# --- Methods ---------------------------------------------------

sub new

{my $invocant = shift;
 my $class = ref($invocant) || $invocant;
 $mech->get('http://search.yahoo.com/search?p=' . apply
    {s/([^a-zA-Z0-9 ])/sprintf '%%%02X', ord $1/ge;
     s/ /+/g;}
   shift);
 my ($next, $results) = read_page();
 return bless {link_to_next => $next, results => $results}, $class;}

sub results

{@{shift()->{results}};}

sub next_page

{my $invocant = shift;
 my $next = $invocant->{link_to_next};
 unless ($next)
    {$invocant->{results} = [];
     return undef;}
 $mech->get($next);
 ($next, my $results) = read_page();
 $invocant->{link_to_next} = $next;
 $invocant->{results} = $results;
 return 1;}</lang>

Python

This example is incorrect. It does not accomplish the given task. Please fix the code and remove this message.

<lang python>import urllib
import re

def fix(x):
    # The tag literals below were eaten by the wiki renderer; <b>, </b>,
    # <wbr>, <wbr /> and <b>...</b> are reconstructed from the parallel
    # examples on this page.
    x = x.replace("<b>","").replace("</b>","").replace("<wbr>","").replace("<wbr />","").replace("<b>...</b>","")
    return x[:x.find("</a></h3>")]

class YahooSearch:

   def __init__(self, query, page=1):       
       self.query = query
       self.page = page
       self.url = "http://search.yahoo.com/search?p=%s&b=%s" %(self.query, ((self.page - 1) * 10 + 1))
       self.content = urllib.urlopen(self.url).read()        

   def getresults(self):
       self.results = []

        # The HTML literals in this pattern were eaten by the wiki renderer;
        # reconstructed from the parallel examples on this page.
        for i in re.findall("<a class=\"yschttl spt\" href=\".+?\" >(.+?)</a>" +
                            "</h3></div><div class=\"abstr\">(.+?)</div>" +
                            "<span class=url>(.+?)</span>", self.content):

           title = fix(i[0])
           content = fix(i[1])
           url = fix(i[2])

           self.results.append(YahooResult(title, content, url))

       return self.results

   def getnextpage(self):
       return YahooSearch(self.query, self.page+1)

   results = property(fget=getresults)
   nextpage = property(fget=getnextpage)

class YahooResult:

   def __init__(self,title,content,url):
       self.title = title
       self.content = content
       self.url = url

# Usage:

x = YahooSearch("test")

for result in x.results:
    print result.title</lang>

R

Library: RCurl
Library: XML

Rather than using regexes to find the content (like some of the other solutions here), this method parses the HTML and finds the appropriate sections. <lang R>YahooSearch <- function(query, page=1, .opts=list(), ignoreMarkUpErrors=TRUE) {

  if(!require(RCurl) || !require(XML))
  {
     stop("Could not load required packages")
  }   
  
  # Replace " " with "%20", etc
  query <- curlEscape(query)
  
  # Retrieve page
  b <- 10*(page-1)+1
  theurl <- paste("http://uk.search.yahoo.com/search?p=",
     query, "&b=", b, sep="")
  webpage <- getURL(theurl, .opts=.opts)
  
  # Save search for nextpage function
  .Search <- list(query=query, page=page, .opts=.opts, 
     ignoreMarkUpErrors=ignoreMarkUpErrors)
  assign(".Search", .Search, envir=globalenv())
    
  # Parse HTML; retrieve results block
  webpage <- readLines(tc <- textConnection(webpage)); close(tc)
  if(ignoreMarkUpErrors)
  {
     pagetree <- htmlTreeParse(webpage, error=function(...){})     
  } else
  {
     pagetree <- htmlTreeParse(webpage)
  }
  
  
  findbyattr <- function(x, id, type="id")
  {
     ids <- sapply(x, function(x) x$attributes[type])
     x[ids==id]   
  }
     
  body <- pagetree$children$html$children$body
  bd <- findbyattr(body$children$div$children, "bd")
  left <- findbyattr(bd$div$children$div$children, "left")
  web <- findbyattr(left$div$children$div$children, "web") 
  resol <- web$div$children$ol
  
  #Get url, title, content from results
  gettextfromnode <- function(x)
  {
     un <- unlist(x$children)  
     paste(un[grep("value", names(un))], collapse=" ") 
  }
  
  n <- length(resol)
  results <- list()
  length(results) <- n
  for(i in 1:n)
  {
     mainlink <- resol[[i]]$children$div$children[1]$div$children$h3$children$a
     url <- mainlink$attributes["href"]
     title <- gettextfromnode(mainlink)
     
     contenttext <- findbyattr(resol[[i]]$children$div$children[2], "abstr", type="class")      
     if(length(contenttext)==0)
     {
         contenttext <- findbyattr(resol[[i]]$children$div$children[2]$div$children$div$children, 
           "sm-abs", type="class")
     }
     
     content <- gettextfromnode(contenttext$div)        
     results[[i]] <- list(url=url, title=title, content=content)                                                        
  }
  names(results) <- as.character(seq(b, b+n-1)) 
  results 

}

nextpage <- function() {

  if(exists(".Search", envir=globalenv())) 
  {
     .Search <- get(".Search", envir=globalenv())   
     .Search$page  <- .Search$page + 1L
     do.call(YahooSearch, .Search)
  } else
  {
     message("No search has been performed yet")
  }   

}

# Usage
YahooSearch("rosetta code")
nextpage()</lang>

Ruby

Uses

Library: RubyGems
Library: Hpricot

to parse the HTML. Someone more skillful than I at XPath or CSS could tighten up the parse_html method.

<lang ruby>require 'open-uri'
require 'hpricot'

SearchResult = Struct.new(:url, :title, :content)

class SearchYahoo

 @@urlinfo = [nil, 'ca.search.yahoo.com', 80, '/search', nil, nil]
 def initialize(term)
   @term = term
   @page = 1
   @results = nil
   @url = URI::HTTP.build(@@urlinfo)
 end
 def next_result
   if not @results
     @results = []
     fetch_results
   elsif @results.empty?
     next_page
   end
   @results.shift
 end
 def fetch_results
   @url.query = URI.escape("p=%s&b=%d" % [@term, @page])
   doc = open(@url) { |f| Hpricot(f) }  
   parse_html(doc)
 end
 def next_page
   @page += 10
   fetch_results
 end
 def parse_html(doc)
   doc.search("div#main").search("div").each do |div|
     next unless div.has_attribute?("class") and div.get_attribute("class").index("res") == 0
     result = SearchResult.new
     div.search("a").each do |link|
       next unless link.has_attribute?("class") and link.get_attribute("class") == "yschttl spt"
       result.url = link.get_attribute("href")
       result.title = link.inner_text
     end
     div.search("div").each do |abstract|
       next unless abstract.has_attribute?("class") and abstract.get_attribute("class").index("abstr")
       result.content = abstract.inner_text
     end
     @results << result
   end
 end

end

s = SearchYahoo.new("test")
15.times do |i|

 result = s.next_result
 puts i+1
 puts result.title
 puts result.url
 puts result.content
 puts

end</lang>

Tcl

Translation of: Python

<lang tcl>package require http

proc fix s {
    # The tag literals in this map were stripped by the wiki renderer;
    # <b>...</b>, <b>, </b>, <wbr> and <wbr /> are reconstructions.
    string map {<b>...</b> "" <b> "" </b> "" <wbr> "" {<wbr />} ""} \
        [regsub "</a>.*" $s ""]
}

proc YahooSearch {term {page 1}} {

    # Build the (ugly) scraper regular expression. The HTML literals were
    # stripped by the wiki renderer; reconstructed from the other examples
    # on this page.
    append re {<a class="yschttl spt" href=".+?" >(.+?)</a></h3>}
    append re {</div><div class="abstr">(.+?)</div>}
    append re {<span class=url>(.+?)</span>}

   # Perform the query; note that this handles special characters
   # in the query term correctly
   set q [http::formatQuery p $term b [expr {$page*10-9}]]
   set token [http::geturl http://search.yahoo.com/search?$q]
   set data [http::data $token]
   http::cleanup $token
   # Assemble the results into a nice list
   set results {}
   foreach {- title content url} [regexp -all -inline $re $data] {
       lappend results [fix $title] [fix $content] [fix $url]
   }
   # set up the call for the next page
   interp alias {} Nextpage {} YahooSearch $term [incr page]
   return $results

}

# Usage: get the first two pages of results
foreach {title content url} [YahooSearch "test"] {
    puts $title
}
foreach {title content url} [Nextpage] {

    puts $title

}</lang>

Works with: Tcl version 8.6

With Tcl 8.6, more options are available for managing the global state, through objects and coroutines. First, an object-based solution that takes the basic YahooSearch functionality and dresses it up to be more Tcl-like: <lang tcl>package require Tcl 8.6

oo::class create WebSearcher {

   variable page term results
   constructor searchTerm {
       set page 0
       set term $searchTerm
       my nextPage
   }
   # This next method *is* a very Tcl-ish way of doing iteration.
   method for {titleVar contentsVar urlVar body} {
       upvar 1 $titleVar t $contentsVar c $urlVar v
       foreach {t c v} $results {
           uplevel 1 $body
       }
   }
   # Reuse the previous code for simplicity rather than writing it anew
   # Of course, if we were serious about this, we'd put the code here properly
   method nextPage {} {
       set results [YahooSearch $term [incr page]]
       return
   }

}

# How to use. Note the 'for' method use below; new "keywords" as methods!
set ytest [WebSearcher new "test"]
$ytest for title - url {

   puts "\"$title\" : $url"

}
$ytest nextPage
$ytest for title - url {

   puts "\"$title\" : $url"

}
$ytest delete ;# standard method that deletes the object</lang>

However, the paradigm of an iterator is also interesting and is more appropriately supported through a coroutine. This version conceals the fact that the service produces output in pages; care should be taken with it because it can produce rather a lot of network traffic...

<lang tcl>package require Tcl 8.6

proc yahoo! term {

   coroutine yahoo![incr ::yahoo] apply {term {
       yield [info coroutine]
       while 1 {
           set results [YahooSearch $term [incr step]]
           if {[llength $results] == 0} {
               return -code break
           }
           foreach {t c u} $results {
               yield [dict create title $t content $c url $u]
           }
       }
   }} $term

}

# Test by getting the first fifty titles...
set it [yahoo! "test"]
for {set i 50} {$i>0} {incr i -1} {

   puts [dict get [$it] title]
   after 300  ;# Slow the code down... :-)

}</lang>

Another approach: uses a class as specified in the task. Also, uses an HTML parser from

Library: tcllib

(parsing HTML with regular expressions is a particular annoyance of mine).

Works with: Tcl version 8.6

<lang tcl>package require Tcl 8.6
package require http
package require htmlparse
package require textutil::adjust

oo::class create yahoosearch {

   method search {s} {
       my variable searchterm page baseurl
       set searchterm $s
       set page 1
       set baseurl {http://ca.search.yahoo.com/search}
   }
   method getresults {} {
       my variable state results current_data
       set results [list]
       set current_data [dict create]
       set state looking_for_results
       htmlparse::parse -cmd [list [self] html_parser_callback] [my gethtml]
   }
   method nextpage {} {
       my variable page
       incr page 10
       my getresults
   }
   
   method nextresult {} {
       my variable results page
       if { ! [info exists results]} {
           my getresults
       } elseif {[llength $results] == 0} {
           my nextpage
       }
       set results [lassign $results result]
       return $result
   }
   method gethtml {} {
       my variable searchterm page baseurl
       set url [format {%s?%s} $baseurl [::http::formatQuery p $searchterm b $page]]
       set response [http::geturl $url]
       set html [http::data $response]
       http::cleanup $response
       return $html
   }
   method html_parser_callback {tag slash param textBehindTheTag} {
       my variable state results current_data
       switch -exact -- $state {
           looking_for_results {
               if {$tag eq "div" && [string first {id="main"} $param] != -1} {
                   set state ready
               }
           }
           ready {
               if {($tag eq "div" && [string first {class="res} $param] != -1) ||
                   ($tag eq "html" && $slash eq "/")
               } { #" -- unbalanced quote disturbs syntax highlighting
                   if {[dict size $current_data] > 0} {lappend results $current_data}
                   set current_data [dict create]
                   set state getting_url
               }
           }
           getting_url {
               if {$tag eq "a" && [string match "*yschttl spt*" $param]} {
                   if {[regexp {href="(.+?)"} $param - url]} {
                       dict set current_data url $url
                   } else {
                       dict set current_data url "no href in tag params: '$param'"
                   }
                   dict set current_data title $textBehindTheTag
                   set state getting_title
               }
           }
           getting_title {
               if {$tag eq "a" && $slash eq "/"} {
                   set state looking_for_abstract
               } else {
                   dict append current_data title $textBehindTheTag
               }
           }
           looking_for_abstract {
               if {$tag eq "span" && [string first {class="url} $param] != -1} {
                   set state ready
               } elseif {$tag eq "div" && [string first {class="abstr} $param] != -1} {
                   dict set current_data abstract $textBehindTheTag
                   set state getting_abstract
               }
           }
           getting_abstract {
               if {$tag eq "div" && $slash eq "/"} {
                   set state ready
               } else {
                   dict append current_data abstract $textBehindTheTag
               }
           }
       }
   }

}

yahoosearch create searcher
searcher search "search text here"

for {set x 1} {$x <= 15} {incr x} {

   set result [searcher nextresult]
   dict with result {
       puts $title
       puts $url
       puts [textutil::adjust::indent [textutil::adjust::adjust $abstract] "  "]
       puts ""
   }

}</lang>