<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet href="/stylesheets/rss.css" type="text/css"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:trackback="http://madskills.com/public/xml/rss/module/trackback/">
  <channel>
    <title>Aaron Feng: Page snatcher</title>
    <link>http://www.aaronfeng.com/articles/2008/02/16/page-snatcher</link>
    <language>en-us</language>
    <ttl>40</ttl>
    <description>Adventures in software development</description>
    <item>
      <title>Page snatcher</title>
      <description>&lt;p&gt;A couple of months ago I wrote a utility that will download a web page with all of its dependencies (CSS and images) to your hard drive.  All the references in the web page will be changed to refer to your local copy.&lt;/p&gt;

&lt;p&gt;I wrote it as a prototype, and it took me 30 to 40 minutes to write it, so
I'm sure there is room for improvement.  I pointed it at a few web pages, such
as Amazon, eBay, Google, and my blog, and it worked pretty well!&lt;/p&gt;

&lt;p&gt;The code requires &lt;a href="http://code.whytheluckystiff.net/hpricot/"&gt;Why the lucky stiff's Hpricot library&lt;/a&gt;.  Without further ado, here is the code:&lt;/p&gt;

&lt;div class="typocode"&gt;&lt;pre&gt;&lt;code class="typocode_default "&gt;require 'rubygems'
require 'hpricot'
require 'open-uri'
require 'rio'

module Hpricot
  class Elem
    def is_css
      if self.name == &amp;quot;link&amp;quot;
        self[&amp;quot;type&amp;quot;] == &amp;quot;text/css&amp;quot;
      else
        false
      end
    end
    def is_full_path
      if self.name == &amp;quot;link&amp;quot;
        self[&amp;quot;href&amp;quot;][0..6] == &amp;quot;http://&amp;quot;
      elsif self.name == &amp;quot;img&amp;quot;
        self[&amp;quot;src&amp;quot;][0..6] == &amp;quot;http://&amp;quot;
      else
        false
      end
    end
  end
end

if ARGV.size.zero?
  puts &amp;quot;Missing web page you wish to snatch.&amp;quot;
  exit
end

url_scheme = &amp;quot;http://&amp;quot;
url = ARGV[0]
doc = Hpricot(open(url_scheme + url))

Dir.mkdir(url) unless File.directory?(url)

doc.search(&amp;quot;link&amp;quot;) do |item|
  if item.is_css
    if item.is_full_path
      rio(item['href']) &amp;gt; rio(url)
    else
      rio(url_scheme + url + item['href']) &amp;gt; rio(url)
    end

    # nested style sheets in another style sheet
    css_path = File.dirname(item['href'])
    css_file = File.basename(item['href']).scan(/(.*?\.css)/m).flatten.to_s

    file = File.open(url + &amp;quot;/&amp;quot; + css_file,&amp;quot;r&amp;quot;)

    inner_css = file.read.scan(/@import '(.*?\.css)';/m).flatten
    inner_css.each do |css|
      css_url = url_scheme + url + css_path + &amp;quot;/&amp;quot; + css
      rio(css_url) &amp;gt; rio(url)
    end
    file.close

    item['href'] = css_file
  end
end

doc.search(&amp;quot;img&amp;quot;) do |item|
  if item.is_full_path
    rio(item[&amp;quot;src&amp;quot;]) &amp;gt; rio(url)
  else
    rio(url_scheme + url + item[&amp;quot;src&amp;quot;]) &amp;gt; rio(url)
  end
  item[&amp;quot;src&amp;quot;] = item[&amp;quot;src&amp;quot;].split(&amp;quot;/&amp;quot;)[-1]
end

File.open(url + &amp;quot;/&amp;quot; + url + &amp;quot;.html&amp;quot;, &amp;quot;w&amp;quot;) do |file|
  file &amp;lt;&amp;lt; doc.to_s
end&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;</description>
      <pubDate>Sat, 16 Feb 2008 22:02:00 -0500</pubDate>
      <guid isPermaLink="false">urn:uuid:15e1a188-cb53-4264-a006-268c2ff5207d</guid>
      <author>Aaron Feng</author>
      <link>http://www.aaronfeng.com/articles/2008/02/16/page-snatcher</link>
      <category>ruby</category>
      <category>programming</category>
    </item>
    <item>
      <title>"Page snatcher" by Aaron Feng</title>
      <description>&lt;p&gt;Thank you Alicia.  Yes, some modification might be required to work for all sites.&lt;/p&gt;</description>
      <pubDate>Sun, 17 Feb 2008 13:08:16 -0500</pubDate>
      <guid isPermaLink="false">urn:uuid:568e0b63-56b3-4b00-adc1-ff464c445f21</guid>
      <link>http://www.aaronfeng.com/articles/2008/02/16/page-snatcher#comment-4347</link>
    </item>
    <item>
      <title>"Page snatcher" by Alicia</title>
      <description>&lt;p&gt;The code is seamless. Great job! This is the base for all sites, isn't it?&lt;/p&gt;</description>
      <pubDate>Sun, 17 Feb 2008 09:53:45 -0500</pubDate>
      <guid isPermaLink="false">urn:uuid:f8fbbb18-18ca-4029-a58f-13d7d5359148</guid>
      <link>http://www.aaronfeng.com/articles/2008/02/16/page-snatcher#comment-4346</link>
    </item>
  </channel>
</rss>
