Yahoo! search interface

From Rosetta Code
Revision as of 10:12, 14 May 2009 by rosettacode>Dkf (→‎Tcl: Added implementation)
Task
Yahoo! search interface
You are encouraged to solve this task according to the task description, using any language you may know.

Create a class for searching Yahoo results. It must implement a Next Page method, and read URL, Title and Content from results.

C#

<lang csharp>using System; using System.Net; using System.Text; using System.Text.RegularExpressions; using System.Collections; using System.Collections.Generic; using System.Linq;

class YahooSearch {

   private string query;
   private string content;
   private int page = 1;
   public YahooSearch(string query) {
       this.query = query;
       this.content = new WebClient().DownloadString("http://search.yahoo.com/search?p=" + query);
   }
   public YahooSearch(string query, int page) {
       this.query = query;
       this.page = page;
       this.content = new WebClient().DownloadString(String.Format("http://search.yahoo.com/search?p={0}&b={1}", query, ((this.page - 1) * 10) + 1));
   }
   string Fix(string x) {
       x = x.Replace("", "").Replace("", "").Replace("", "").Replace("", "").Replace("...", "");

int i = x.IndexOf("</a>");

       if (i > 0) return x.Substring(0, i);
       else return x;        
   } 
   public YahooResult[] Results {
       get {
           ArrayList results = new ArrayList();

foreach (Match e in new Regex("<a class=\"yschttl spt\" href=\".+?\" >(.+?)</a>(?(

.+?

))

(.+?)

(.+?)").Matches(this.content)) {

               string rurl = Fix(e.Groups[3].Value);
               string rtitle = Fix(e.Groups[1].Value);
               string rcontent = Fix(e.Groups[2].Value);
               
               results.Add(new YahooResult(rurl, rtitle, rcontent));
           }
           return (YahooResult[])results.ToArray(typeof(YahooResult));
       }
   }
   public YahooSearch NextPage() {
       return new YahooSearch(this.query, this.page + 1);
   }
   public YahooSearch GetPage(int page) {
       return new YahooSearch(this.query, page);
   }

}

class YahooResult {

   public string URL { get; set; }
   public string Title { get; set; }
   public string Content { get; set; }
   public YahooResult(string url, string title, string content) {
       this.URL = url;
       this.Title = title;
       this.Content = content;
   }

}

// Usage:

class Prog {

   static void Main() {
       YahooSearch x = new YahooSearch("test");
       foreach (YahooResult result in x.Results) {
           Console.WriteLine(result.Title);
       }
   }

}</lang>

Perl

<lang perl>package YahooSearch;

use Encode; use HTTP::Cookies; use WWW::Mechanize;

  1. --- Internals -------------------------------------------------

sub apply (&$)

{my $f = shift; local $_ = shift; $f->(); return $_;}
  1. We construct a cookie to get 100 results per page and prevent
  2. "enhanced results".

my $search_prefs = 'v=1&n=100&sm=' .

   apply {s/([^a-zA-Z0-9])/sprintf '%%%02X', ord $1/ge}
   join '|',
   map {'!' . $_}
   qw(hsb Zq0 XbM sss dDO VFM RQh uZ0 Fxe yCl GP4 FZK yNC mEG niH);

my $cookies = HTTP::Cookies->new; $cookies->set_cookie(0, 'sB', $search_prefs, '/', 'search.yahoo.com');

my $mech = new WWW::Mechanize

  (cookie_jar => $cookies,
   stack_depth => 0);

sub read_page

{my ($next, $page, @results) =
    ($mech->find_link(text => 'Next >')->url,
     decode 'iso-8859-1', $mech->content);
 while ($page =~ m

{

<a \s class="yschttl \s spt" \s href=" ([^"]+) " \s* > #" (.+?) </a> .+?
(.+?)
}xg)
    {push @results, {url => $1, title => $2, content => $3};
     foreach ( @{$results[-1]}{qw(title content)} )
        {s/<.+?>//g;
         $_ = encode 'utf8', $_;}}
 return $next, \@results;}
  1. --- Methods ---------------------------------------------------

sub new

{my $invocant = shift;
 my $class = ref($invocant) || $invocant;
 $mech->get('http://search.yahoo.com/search?p=' . apply
    {s/([^a-zA-Z0-9 ])/sprintf '%%%02X', ord $1/ge;
     s/ /+/g;}
   shift);
 my ($next, $results) = read_page();
 return bless {link_to_next => $next, results => $results}, $class;}

sub results

{@{shift()->{results}};}

sub next_page

{my $invocant = shift;
 my $next = $invocant->{link_to_next};
 unless ($next)
    {$invocant->{results} = [];
     return undef;}
 $mech->get($next);
 ($next, my $results) = read_page();
 $invocant->{link_to_next} = $next;
 $invocant->{results} = $results;
 return 1;}</lang>

Python

<lang python>import urllib import re

def fix(x):

   x =  x.replace("","").replace("","").replace("","").replace("","").replace("...","")
return x[:x.find("</a>

")]

class YahooSearch:

   def __init__(self, query, page=1):       
       self.query = query
       self.page = page
       self.url = "http://search.yahoo.com/search?p=%s&b=%s" %(self.query, ((self.page - 1) * 10 + 1))
       self.content = urllib.urlopen(self.url).read()        

   def getresults(self):
       self.results = []

for i in re.findall("<a class=\"yschttl spt\" href=\".+?\" >(.+?)</a>"+\ "

(.+?)

(.+?)",self.content):

           title = fix(i[0])
           content = fix(i[1])
           url = fix(i[2])

           self.results.append(YahooResult(title, content, url))

       return self.results

   def getnextpage(self):
       return YahooSearch(self.query, self.page+1)

   results = property(fget=getresults)
   nextpage = property(fget=getnextpage)

class YahooResult:

   def __init__(self,title,content,url):
       self.title = title
       self.content = content
       self.url = url

  1. Usage:

x = YahooSearch("test")

for result in x.results:

   print result.title</lang>

Tcl

Translation of: Python

<lang tcl>package require http

proc fix s {

   string map {... ""  ""  ""  "" "" ""} \

[regsub "</a>.*" $s ""]

} proc YahooSearch {term {page 1}} {

   # Build the (ugly) scraper URL

append re {<a class="yschttl spt" href=".+?" >(.+?)</a>} append re {

(.+?)} append re {

(.+?)}

   # Perform the query; note that this handles special characters
   # in the query term correctly
   set q [http::formatQuery p $term b [expr {$page*10-9}]]
   set token [http::geturl http://search.yahoo.com/search?$q]
   set data [http::data $token]
   http::cleanup $token
   # Assemble the results into a nice list
   set results {}
   foreach {- title content url} [regexp -all -inline $re $data] {
       lappend results [fix $title] [fix $content] [fix $url]
   }
   return $results

}

  1. Usage:

foreach {title content url} [YahooSearch "test"] {

   puts $title

}</lang> Tcl 8.6 can use its coroutine support to produce an iterator more like that Python code: <lang tcl>proc yahoo! term {

   coroutine yahoo![incr ::yahoo] appy {term {
       yield [info coroutine]
       while 1 {
           set results [YahooSearch $term [incr step]]
           if {[llength $results] == 0} {
               return -code break
           }
           foreach {t c u} $result {
               yield [dict create title $t content $c url $u]
           }
       }
   }} $term

}

  1. test...

set it [yahoo! "test"] while 1 {

   puts [dict get [$it] title]

}</lang>