menu

Sunday, 2 December 2012

Web Scraping Amazon with PHP

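This snippet of PHP code demonstrates web scraping. It downloads a sample page from Amazon.com, selects HTML elements by their class names, and writes the matched text to an RSS feed file. Note that `file_get_html()` is not a PHP built-in: an HTML-parsing library that provides it (such as PHP Simple HTML DOM Parser) must be included before the script runs.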


<?php
/**
 * Scrapes Amazon's Kindle best-seller page and writes every title whose
 * price is listed as "Free" to an RSS 2.0 feed file named amazon.rss.
 *
 * Each feed item carries the book title, an affiliate link built from the
 * book's ASIN, and a description holding "By <author link(s)>" as
 * entity-encoded HTML.
 *
 * Throws RuntimeException when the HTML parser is unavailable, when the
 * page cannot be fetched, or when the feed file cannot be written.
 */

// file_get_html() is not a PHP built-in. NOTE(review): it is presumably
// supplied by the Simple HTML DOM library, which must be included before
// this script runs -- confirm how it is loaded in your setup.
if (!function_exists('file_get_html')) {
    throw new RuntimeException('file_get_html() is not defined; include the HTML parsing library first.');
}

$sourceUrl    = 'http://www.amazon.com/Best-Sellers-Kindle-Store/zgbs/digital-text/';
$affiliateTag = '/?tag=publisherapi-20';

// DATE_RSS is the RFC 822 style format that RSS 2.0 requires for <pubDate>.
$now = date(DATE_RSS);

// NOTE(review): the feed is declared as UTF-8 on the assumption that the
// parser hands back UTF-8 text -- confirm against the library's settings.
$head  = '<?xml version="1.0" encoding="UTF-8"?>';
$head .= '<rss version="2.0">';
$head .= '<channel>';
$head .= '<title>Amazon </title>';
$head .= '<link>http://www.amazon.com</link>';
$head .= '<description>Amazon RSS Feed</description>';

$page = file_get_html($sourceUrl);
if (!$page) {
    throw new RuntimeException('Unable to download or parse ' . $sourceUrl);
}

// Start from an empty string so the feed is still valid when nothing is free.
$rssfeed = '';

foreach ($page->find('div.zg_item_compact') as $entry) {
    // Keep only entries that have a price element reading exactly "Free".
    $isFree = false;
    foreach ($entry->find('strong.price') as $price) {
        if (trim($price->plaintext) === 'Free') {
            $isFree = true;
            break;
        }
    }
    if (!$isFree) {
        continue;
    }

    // Use the first title link that carries an ASIN ("/dp/<ASIN>").
    $link  = '';
    $title = '';
    foreach ($entry->find('div.zg_title a') as $book) {
        $href = html_entity_decode($book->href, ENT_QUOTES, 'UTF-8');
        if (preg_match('#/dp/([^/?]+)#', $href, $matches) === 1) {
            $link  = 'http://www.amazon.com/dp/' . trim($matches[1]) . $affiliateTag;
            // Decode first so entities already present in the scraped text
            // are not escaped a second time further down.
            $title = html_entity_decode(trim($book->plaintext), ENT_QUOTES, 'UTF-8');
            break;
        }
    }
    if ($link === '') {
        // Without an ASIN there is nothing to link to; skip the entry
        // instead of emitting a broken <item>.
        continue;
    }

    // Build the author by-line as an HTML fragment.
    $authorLinks = array();
    foreach ($entry->find('div.zg_byline a') as $author) {
        // Assumes the author href is a site-relative path, as the original
        // code did -- TODO confirm against the live markup.
        $bio  = 'http://www.amazon.com'
              . html_entity_decode($author->href, ENT_QUOTES, 'UTF-8')
              . $affiliateTag;
        $name = html_entity_decode(trim($author->plaintext), ENT_QUOTES, 'UTF-8');
        $authorLinks[] = '<a href="' . htmlspecialchars($bio, ENT_QUOTES, 'UTF-8') . '">'
                       . htmlspecialchars($name, ENT_QUOTES, 'UTF-8') . '</a>';
    }
    $descriptionHtml = '';
    if (count($authorLinks) > 0) {
        $descriptionHtml = 'By ' . implode(', ', $authorLinks);
    }

    // htmlspecialchars() emits only entities that XML itself defines, so
    // the feed stays well-formed. The description is escaped as a whole
    // because RSS expects embedded HTML to be entity-encoded.
    $safeLink = htmlspecialchars($link, ENT_QUOTES, 'UTF-8');

    $rssfeed .= '<item>';
    $rssfeed .= '<title>' . htmlspecialchars($title, ENT_QUOTES, 'UTF-8') . '</title>';
    $rssfeed .= '<link>' . $safeLink . '</link>';
    $rssfeed .= '<guid isPermaLink="true">' . $safeLink . '</guid>';
    $rssfeed .= '<description>' . htmlspecialchars($descriptionHtml, ENT_QUOTES, 'UTF-8') . '</description>';
    $rssfeed .= '<pubDate>' . $now . '</pubDate>';
    $rssfeed .= '</item>';
}

$footer  = '</channel></rss>';
$rssfeed = $head . $rssfeed . $footer;

// Fail loudly rather than leave a missing or stale feed file behind.
if (file_put_contents('amazon.rss', $rssfeed) === false) {
    throw new RuntimeException('Unable to write amazon.rss');
}
?>