rss-prefetch

#!/usr/bin/perl
#
# rss-prefetch -- fetch rss feeds from Oddmuse pages and put them in a
# directory
#
# Copyright (C)  2005  Alex Schroeder <alex@emacswiki.org>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
# 02111-1307, USA.

use LWP::UserAgent;

my $verbose = 0;

sub UrlEncode {
  my $str = shift;
  return '' unless $str;
  my @letters = split(//, $str);
  my @safe = ('a' .. 'z', 'A' .. 'Z', '0' .. '9', '-', '_', '.', '!', '~', '*', "'", '(', ')', '#');
  foreach my $letter (@letters) {
    my $pattern = quotemeta($letter);
    if (not grep(/$pattern/, @safe)) {
      $letter = sprintf("%%%02x", ord($letter));
    }
  }
  return join('', @letters);
}

sub GetRaw {
  my $uri = shift;
  my $ua = LWP::UserAgent->new;
  my $response = $ua->get($uri);
  warn $response->status_line if $response->code != 200 and $verbose;
  return $response->content;
}

sub ParseData {
  my $data = shift;
  my %result;
  while ($data =~ /(\S+?): (.*?)(?=\n[^ \t]|\Z)/sg) {
    my ($key, $value) = ($1, $2);
    $value =~ s/\n\t/\n/g;
    $result{$key} = $value;
  }
  return %result;
}

my @pages = qw(RecentNearChanges RecentFarChanges
               RecentFarChangesEnglishOnly);
my $dir = '/org/org.emacswiki/htdocs/community/page/';
my $cache = '/org/org.emacswiki/htdocs/community/rss/';
my %done = ();
foreach my $page (@pages) {
  local $/ = undef;   # Read complete files
  open(F, $dir . uc(substr($page, 0, 1)) . '/' . $page . '.pg')
    or die "Cannot read $_ page file";
  my $data = <F>;
  close(F);
  my %result = ParseData($data);
  while ($result{text} =~ /<rss(.*?)>/sg) {
    my @uris = map { s/^"?(.*?)"?$/$1/; $_; } split(' ', $1);
    foreach my $uri (@uris) {
      next if $uri =~ /^\d+$/;
      next if $done{$uri};
      $done{$uri} = 1;
      my $rss = GetRaw($uri) or $verbose and warn "No data from $uri";
      next unless $rss;
      my $name = UrlEncode($uri);
      open(F, "> $cache$name")
	or $verbose and warn "Cannot write $uri page file";
      print F $rss;
      close F;
    }
  }
};