[mythtv-users] Australian TV grabber (tv_grab_au)
Michael Smith
myth at immir.com
Tue Apr 19 13:18:30 UTC 2005
> I've attached a copy to this to this message...
*sigh*
As usual I missed, the release version is at:
<http://immir.com/tv_grab_au>
Only really foxtel channel list has been extended, so no need to
download it if you don't use foxtel.
Michael.
-------------- next part --------------
#!/usr/bin/perl -w
eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
if 0; # not running under some shell
# --- new NMSN Australian TV grabber by Michael 'Immir' Smith...
# --- $Id: tv_grab_au,v 1.18 2005/04/19 13:05:54 mythtv Exp $
#
# A current version of this script should usually be available here:
#
# <http://immir.com/tv_grab_au>
#
# pod documentation coming shortly...
#
# in brief: --configure configuration
# --list-channels show subscribed channels and xmltvids
# --slow download a page for each show for details
# --days <n> days to grab
# --output <file> xml file for output
use strict;
use Getopt::Long;
use LWP::UserAgent;
use Date::Manip;
use File::Path;
use File::Basename;
use Data::Dumper;
use HTML::TreeBuilder;
use XMLTV;
use XMLTV::Ask;
use XMLTV::ProgressBar;
use XMLTV::Config_file;
# --- global parameters
my $conf; # configuration parameters
my $channels; # ref to hash of subscribed channel names to xmltvids
my %chanid; # mapping from lowercase channel name to xmltv channel id
my $lang = "en";
my $spoofMSID = 1; # spoof random MSIDs to avoid redirects?
my $output_dir = "/var/local/tv_grab_au";
my $cache_file = "$output_dir/cached.pl";
# --- various NMSN site URLs
my $NMSN = "http://tvguide.ninemsn.com.au";
my $GUIDE = "http://tvguide.ninemsn.com.au/guide";
my $DETAILS = "http://tvguide.ninemsn.com.au/closeup/default.asp";
my $TVTODAY = "http://tvguide.ninemsn.com.au/todaytv/default.asp";
my ($Revision) = '$Revision: 1.18 $' =~ /Revision:\s*(\S+)/;
my $debug = 0;
$Data::Dumper::Sortkeys = 1;
$Data::Dumper::Indent = 1;
# --- Command-line options
my $opt_configfile;
my $opt_configure;
my $opt_output = "$output_dir/guide.xml";
my $opt_gui = 1;
my $opt_slow = 0;
my $opt_list_channels;
my $opt_days = 5;
GetOptions( 'days=i' => \$opt_days,
'list-channels' => \$opt_list_channels,
'output=s' => \$opt_output,
'config-file=s' => \$opt_configfile,
'configure' => \$opt_configure,
'gui=s' => \$opt_gui,
'slow' => \$opt_slow,
'debug' => \$debug );
XMLTV::Ask::init($opt_gui);
my $config_file = XMLTV::Config_file::filename
($opt_configfile, 'tv_grab_au', not $debug );
# --- read and parse configuration --- must do this before configuring
# in case the user has chosen non-default xmltvids for some channels...
if (-r $config_file) {
local (@ARGV, $/) = ($config_file, undef);
no warnings 'all';
eval <>;
die "error in conf file: $@" if $@ and not $opt_configure;
print "unable to read configuration file... continuing configuring..." if $@;
}
# --- extract sorted subscribed channel list from config-file hash;
# also compute canonicalised lowercased channel to xmltvid hash
%chanid = map { lc $_, $channels->{$_} } keys %$channels;
# --- are we configuring?
if ($opt_configure) { configure($config_file), exit 0 }
# --- are we just listing channels?
if ($opt_list_channels) {
print " $_ -> $channels->{$_}\n" for sort keys %$channels;
exit 0;
}
# --- we must be grabbing program information...
my $runtime = time();
# whether we get individual program details (follow closeup pids)
my $slow = $opt_slow || $conf->{slow};
if ($debug) {
print "tv_grab_au revision $Revision\n";
print "configuration:\n";
print " TZ = $conf->{TZ} slow = $conf->{slow}\n";
print " services:\n";
for my $service (@{$conf->{services}}) {
print " $service->{name}: " .
" region=$service->{region} id=$service->{regionid}\n";
}
print "channel list:\n";
print " $_ -> $channels->{$_}\n" for keys %$channels;
print "\n";
}
print fixplural("grabbing $opt_days days into $opt_output\n");
# --- first get list of shows seen last time
my ($cached, $newcache);
if (-r $cache_file) {
local (@ARGV, $/) = ($cache_file, undef);
no warnings 'all';
eval <>;
die "$@" if $@;
}
# --- now get the shows...
my @shows;
for my $day (0 .. $opt_days-1) {
my $date = Ymd(DateCalc("today", "+ $day days"));
my $date6am = ParseDate("6am $date");
for my $service (@{$conf->{services}}) {
my $guidedata = get_guide_page($service, dmY($date6am)) or next;
my $tree = HTML::TreeBuilder->new_from_content($guidedata);
for ($tree->look_down('_tag' => 'table', 'class' => 'tv')) {
# extract channel names from the first row of the table
# (this row has align=middle and the channels are in bold)
my @hdr = map { $_->as_text }
$_->look_down('_tag' => 'tr', 'align' => 'middle')
->look_down('_tag' => 'b');
my @span = (0) x @hdr; # rowspans to infer columns
my $row = 0; # row number (to compute start times)
for ($_->look_down('_tag' => 'tr', 'valign' => 'top')) {
my @idx = grep { $span[$_] == 0 } 0..$#hdr; # columns for this row
for ($_->look_down('_tag' => 'td', 'class' => 'tv')) {
my $col = shift @idx;
my $rowspan = $_->attr("rowspan") || 1;
$span[$col] = $rowspan;
my $channel = $hdr[$col];
next unless $chanid{lc $channel}; # not grabbing this channel
my ($e) = $_->content_list;
next unless ref($e) eq 'HTML::Element';
next unless $e->tag eq 'a';
my ($pid) = $e->as_HTML =~ /pid=(\d+)/;
next if $pid == 0; # null programme at bottom of table
my $title = $e->as_text();
# --- check (pid, row, rowspan, title) against old data
my $cache_id = "$date:$pid:$row:$rowspan:$title";
if ($cached->{$cache_id}) {
print "CACHED($cache_id)\n" if $debug;
$newcache->{$cache_id} = $cached->{$cache_id};
push @shows, $cached->{$cache_id};
next;
}
print "NEW/CHANGED($cache_id)\n" if $debug;
# --- compute start and stop times based on row of table and
# rowspan --- although this appears problematic for some days
my $start;
$title =~ s/(\d):(\d)(am|pm)/$1:0$2$3/;
if ($title =~ s/\s* \[ ((\d+:\d+) \s* (am|pm)) \] \s* //x) {
$start = ParseDate("$1 $date");
} else {
$start = DateCalc($date6am, ($row*5) . " minutes");
}
my $stop = DateCalc($start, ($rowspan*5) . " minutes");
($start .= " $conf->{TZ}") =~ tr/://d;
($stop .= " $conf->{TZ}") =~ tr/://d;
my $show = { 'title' => [[$title, $lang]],
'start' => $start,
'stop' => $stop,
'channel' => $chanid{lc $channel} };
# --- fill in more details? ---
get_closeup_details($show,$pid) if $slow;
# --- that's it!
push @shows, $show;
$newcache->{$cache_id} = $show;
print Dumper($show) if $debug;
}
++ $row;
@span = map { $_ - 1} @span; # update rowspan counts
}
}
$tree->delete();
}
}
# --- now write to xml
print "writing xml\n";
my %writer_args = ( encoding => 'ISO-8859-1' );
if ($opt_output) {
my $fh = new IO::File(">$opt_output") or die "can't open $opt_output: $!";
$writer_args{OUTPUT} = $fh;
}
my $writer = new XMLTV::Writer(%writer_args);
$writer->start
( { 'source-info-url' => $NMSN,
'source-info-name' => "NMSN TV Guide",
'generator-info-name' => "XMLTV - tv_grab_au NMSN v$Revision"} );
$writer->write_channel( { 'display-name' => [[$_, $lang]],
'id' => $chanid{lc $_} } ) for sort keys %$channels;
$writer->write_programme($_) for @shows;
$writer->end();
# --- save seen data
$cached = $newcache;
open(CACHE, "> $cache_file") or die "cannot open $cache_file: $!";
print CACHE Data::Dumper->Dump([$cached], ["cached"]);
close CACHE;
# --- report runtime
$runtime = time() - $runtime;
my $min = int($runtime / 60);
my $sec = $runtime % 60;
printf "tv_grab_au: finished in %d:%d\n", $min, $sec;
exit 0; # Game over, man!
# ---------------------------------------------------------------------------
# --- subroutines
# --- get details from the closeup page for given show
sub get_closeup_details {
my $show = shift;
my $pid = shift;
my $title = $show->{title}->[0]->[0];
my $date = Ymd($show->{start});
my $details = get_details_page($pid) or return;
# --- use HTML::TreeBuilder to parse the details from the page...
my $tree = HTML::TreeBuilder->new_from_content($details);
# --- the details are in a two row table: first row is the header
# ('Time', 'Program'), second row is the body of the table containing
# the information we want.
my ($hd,$bd) = $tree->look_down
('_tag' => 'table', 'borderColor' => '#003366')->content_list();
# --- sanity check the header row
die "the HTML format of the program details page has changed"
unless $hd->as_text =~ /Time .* Program/x;
# --- remove breaks and "revert" some markup to simplify parsing
$_->delete for $bd->look_down('_tag' => 'br');
$_->replace_with_content for $bd->look_down('_tag' => qr/^(b|font)$/);
# --- extract (text) content lists of the 2 row cells
my @td;
for ($bd->look_down('_tag' => 'table')) {
push @td, grep { not ref($_) } $_->content_list
for $_->look_down('_tag' => 'td');
}
# --- clean fields up a little
s/(^\s+|\s+$)//g, s/\((.*?)\)/$1/g for @td;
# --- Here's an example of the contents of @td at this point:
# ("7:00 pm", "Southern Cross TEN Capital", "The All Time Greatest
# Simpsons", "- Cape Feare", "30 mins , Rated: PG", "Genre:",
# "Cartoon", "Sideshow Bob terrorises Bart after he is paroled
# from prison.", "" )
my ($start_time, $channel,
$title1, $title2, $genre, $desc) = @td[0,1,2,3,6,7];
my ($duration, $rating) = (split(/\s*,\s*/, $td[4]), "", "");
# --- is this a channel we know about? is it consistent with the guide?
$channel = $chanid{lc $channel}; # -- convert to xmltv channel id
die "channel mismatch for show: $title" unless $channel eq $show->{channel};
# --- now clean up a few things
$title2 =~ s/^\s*-\s*//;
$rating =~ s/Rated:\s*//;
$duration =~ s/mins/minutes/;
# --- compute start and stop times
my $start = ParseDate("$start_time $date");
$start = DateCalc($start, "+ 1 day") if $start_time =~ /^0?[0-5]:.. am/;
my $stop = DateCalc($start, "+ $duration");
# --- append timezone and strip colons
($start .= " $conf->{TZ}") =~ tr/://d;
($stop .= " $conf->{TZ}") =~ tr/://d;
$show->{'title'} = [[$title1, $lang]];
$show->{'start'} = $start;
$show->{'stop'} = $stop;
$show->{'sub-title'} = [[$title2, $lang]] if $title2;
$show->{'desc'} = [[$desc, $lang]] if $desc;
$show->{'category'} = [[$genre, $lang]] if $genre;
}
# --- configure: query for region, services, and channels and write config
sub configure {
my $config_file = shift;
my $date = dmY("today");
my $firstpage;
XMLTV::Config_file::check_no_overwrite($config_file);
# --- extract user's ids for channels (if there were any in
# the config file), add the defaults then clear the channels hash
$chanid{lc $_} = $channels->{$_} for keys %$channels;
for (channel_mappings()) {
my ($name, $id) = / \s* (.+?) \s* : \s* (\S+) /x or next;
$chanid{lc $name} = $id unless $chanid{lc $name}; # use user's if defined
}
$channels = {};
$conf = {};
# --- get timezone
$conf->{TZ} = ask("Please enter your timezone offset (default '+1000') :");
$conf->{TZ} = '+1000' unless $conf->{TZ} =~ /^ \s* \+\d\d\d\d \s* $/x;
my @channellist;
# --- now find list of services - note that this appears to be invariant,
# so perhaps we should always offer the same list and skip the get?
{
my %servicenames = ( free => 1);
$firstpage = with_progress("getting list of services",
sub { get_page("$TVTODAY?channel=free") });
++$servicenames{$1} while $firstpage =~ /channel=(\w+)/g;
my @choices = sort keys %servicenames;
my @flag = ask_many_boolean
(0, map { "Grab listings for $_ channels" } @choices);
for (0..$#choices) {
next unless $flag[$_];
push @{$conf->{services}}, { name => $choices[$_] };
}
# --- now loop over services; find region/regionid and list of
# channels
for my $service (@{$conf->{services}}) {
my ($page, $base);
if ($service->{name} eq 'free') {
# --- get list of regions
my %region;
my $tree = HTML::TreeBuilder->new_from_content($firstpage);
for ($tree->look_down('_tag' => 'select', 'name' => 'region')) {
for ($_->look_down('_tag' => 'option')) {
$region{$_->as_text()} = $_->attr('value');
}
}
$tree->delete();
my @choices = sort keys %region;
$service->{region}
= ask_choice("Select your region for free channels",
$choices[0], @choices);
$service->{regionid} = $region{$service->{region}};
$page = with_progress
( "getting list of channels free service in " .
"$service->{region}", sub { get_guide_page($service, $date) } );
} else {
# --- find regionid for service
($page, $base) = with_progress
( "getting regionid and channels for service $service->{name}",
sub { get_content_base("$TVTODAY?channel=$service->{name}") } );
$service->{region} = "Australia";
($service->{regionid})
= $base =~ /_(\d+).asp/ or die "cannot find regionid";
# page now has channel list too
}
# --- now append channels for this service
my %skip;
# --- find the channels
my $tree = HTML::TreeBuilder->new_from_content($page);
for ($tree->look_down('_tag' => 'table', 'class' => 'tv',
'width' => '100%') # only one table of this type
->look_down('_tag' => 'tr') # ..first row has channels
->look_down('_tag' => 'b')) { # ..in bold tags
my $channel = $_->as_text;
push @channellist, $channel;
unless ($chanid{lc $channel}) { # check/define xmltvid
my $id = lc($channel);
$id =~ s/( ^\s+ | \s+$ | \W )//gx;
$id .= ".$service->{name}.au"; # e.g., "foxtel.au", "free.au"
print "Warning, unknown channel '$channel', using '$id' as id\n";
$chanid{lc $channel} = $id;
}
}
$tree->delete();
}
}
my @select = ask_many_boolean
(1, map { "subscribe to channel $_ -> $chanid{lc $_}" } @channellist);
for (0..$#channellist) {
next unless $select[$_];
my $name = $channellist[$_];
$channels->{$name} = $chanid{lc $name};
}
my @channels = sort keys %$channels;
# --- does the user want the slow option turned on by default?
$conf->{slow} =
ask_boolean("Show descriptions, ratings, genres and more accurate\n" .
"time information is available by downloading individual\n" .
"pages for each show, but this takes a lot longer\n\n" .
"Do you want this (--slow) option to be on by default?");
# --- report configuration and ask for confirmation
my $channel_count = @channels;
my $services_info;
for my $service (@{$conf->{services}}) {
$services_info .= "service: name=$service->{name}, " .
" region=$service->{region} (id=$service->{regionid})\n";
}
die "aborting configuration" unless
ask_boolean( "Please confirm the following configuration:\n" .
" TZ = $conf->{TZ}\n" .
" $services_info\n" .
" ($channel_count subscribed channels)\n\n" .
"[ use the '--list-channels' option for the\n" .
" xmltvids to use in mythtvsetup ]\n\n" .
" Continue?\n");
# --- open config file and write the configuration
-d dirname($config_file) or mkdir dirname($config_file)
or die "cannot create directory for $config_file: $!";
# --- dump as perl code using Data::Dumper
open(CONF, ">$config_file") or die "cannot write to $config_file: $!";
print CONF Data::Dumper->Dump([$conf, $channels], ["conf", "channels"]);
close CONF;
print "wrote config_file: $config_file\n";
}
# --- we can avoid redirections by spoofing random MSIDs in the URLs
use Digest::MD5 qw{md5_hex};
sub MSID { $spoofMSID ? "&MSID=" . md5_hex(rand) : "" }
# --- get NMSN guide for given date (perversely "ddmmYYYY")
sub get_guide_page {
my $service = shift;
my $date = shift;
my $url = "$GUIDE/${date}_$service->{regionid}.asp?"
. "channel=$service->{name}" . MSID();
print "GET_GUIDE_PAGE($url)\n" if $debug;
my $page = get_page($url) or
print "Warning: Failed to get program listing for day $date\n";
return $page;
}
# --- get NMSN program details for given pid
sub get_details_page {
my $pid = shift;
my $url = "$DETAILS?pid=$pid" . MSID();
print "GET_DETAILS_PAGE($url)\n" if $debug;
my $page = get_page($url) or
print "Warning: Failed to get program details for pid $pid\n";
return $page;
}
# --- get a page
sub get_page {
my $url = shift;
my $page = (get_content_base($url))[0];
$page =~ s/ / /g;
return $page;
}
# --- get a page and its base (and report all redirections if debugging)
my $ua;
sub get_content_base {
my $url = shift;
$ua or $ua = LWP::UserAgent->new, $ua->timeout(30);
my $response = $ua->get($url);
die "error reading page $url" unless $response->is_success;
if ($debug and (my $r = $response)->previous) {
print "GET_CONTENT_BASE redirection backtrace:\n";
while ($r) { print " ", $r->base, "\n"; $r = $r->previous }
}
return $response->content, $response->base;
}
# --- show a progress message during call to code (given by closure)
sub with_progress {
my ($message, $sub) = @_;
my $bar = new XMLTV::ProgressBar($message, 1);
my @results = $sub->();
$bar->update, short_pause(), $bar->finish;
return wantarray ? @results : $results[0];
}
sub short_pause { select(undef, undef, undef, 0.33) }
# ---------------------------------------------------------------------------
# --- misc/pedantic stuff...
sub fixplural {
local $_ = shift;
s/(\d+) (\s+) (\w+)s (\s)/$1 . $2 . $3 . ($1==1?"":"s") . $4/xe;
$_
}
sub Ymd { UnixDate($_[0], "%Y%m%d") or die "problem in Ymd($_[0])" }
sub dmY { UnixDate($_[0], "%d%m%Y") or die "problem in dmY($_[0])" }
# ---------------------------------------------------------------------------
# --- here is default the channel list... comments welcome :-)
sub channel_mappings {
return grep ! /^#/, split "\n", qq{
# --- Free channels
ABC NSW : nsw.abc.gov.au
ABC QLD : qld.abc.gov.au
ABC TAS : tas.abc.gov.au
ABC ACT : act.abc.gov.au
ABC2 : abc2.abc.gov.au
Channel Seven Sydney : sydney.seven.com.au
Channel Seven Queensland : queensland.seven.com.au
Prime Southern : southern.prime.com.au
SBS Sydney : sydney.sbs.com.au
SBS Queensland : queensland.sbs.com.au
SBS News : news.sbs.com.au
SBS EASTERN : eastern.sbs.com.au
Network TEN Sydney : sydney.ten.com.au
Southern Cross TEN Capital : capital.southerncrossten.com.au
Southern Cross TEN Queensland : queensland.southerncrossten.com.au
Channel Nine Sydney : sydney.nine.com.au
WIN Television NSW : nsw.win.com.au
WIN Television QLD : qld.win.com.au
# --- Foxtel
Arena TV : arena.foxtel.com.au
BBC World : bbcworld.foxtel.com.au
Cartoon Network : cartoon.foxtel.com.au
Channel [V] : v.foxtel.com.au
CNBC : cnbc.foxtel.com.au
CNN : cnn.foxtel.com.au
Discovery Channel : discovery.foxtel.com.au
FOX News : foxnews.foxtel.com.au
FOX8 : fox8.foxtel.com.au
MAX : max.foxtel.com.au
National Geographic Channel : natgeo.foxtel.com.au
Nickelodeon : nickelodeon.foxtel.com.au
Showtime : showtime.foxtel.com.au
Showtime 2 : showtime2.foxtel.com.au
Sky News : skynews.foxtel.com.au
TV1 : tv1.foxtel.com.au
UKTV : uktv.foxtel.com.au
World Movies : worldmovies.foxtel.com.au
A1 : a1.foxtel.com.au
ACC : acc.foxtel.com.au
ADULTS ONLY : adultsonly.foxtel.com.au
ANIMAL PLANET : animalplanet.foxtel.com.au
ANTENNA PACIFIC : antennapacific.foxtel.com.au
ARENA+2 : arena2.foxtel.com.au
AURORA : aurora.foxtel.com.au
BLOOMBERG : bloomberg.foxtel.com.au
BOOMERANG : boomerang.foxtel.com.au
CLUB [V] : clubv.foxtel.com.au
CMC : cmc.foxtel.com.au
CRIME & INVESTIGATION NETWORK : crime.foxtel.com.au
DISCOVERY HEALTH : health.discovery.foxtel.com.au
DISCOVERY SCIENCE : science.discovery.foxtel.com.au
DISCOVERY TRAVEL & ADVENTURE : travel.discovery.foxtel.com.au
DISNEY CHANNEL : disney.foxtel.com.au
E! : e.foxtel.com.au
ESPN : espn.foxtel.com.au
EUROSPORT NEWS : eurosportnews.foxtel.com.au
FOOD : food.foxtel.com.au
FOX CLASSICS : classics.foxtel.com.au
FOX CLASSICS+2 : classics2.foxtel.com.au
FOX SPORTS 1 : sports1.foxtel.com.au
FOX SPORTS 2 : sports2.foxtel.com.au
FOX8+2 : fox82.foxtel.com.au
FTV : ftv.foxtel.com.au
FUEL : fuel.foxtel.com.au
HALLMARK CHANNEL : hallmark.foxtel.com.au
HOW TO : howto.foxtel.com.au
MAIN EVENT : mainevent.foxtel.com.au
MOVIE EXTRA : movieextra.foxtel.com.au
MOVIE GREATS : moviegreats.foxtel.com.au
MOVIE ONE : movieone.foxtel.com.au
MOVIE ONE TAKE 2 : movieonetake2.foxtel.com.au
MTV : mtv.foxtel.com.au
NICK JNR : nickjnr.foxtel.com.au
OVATION : ovation.foxtel.com.au
RAI INTERNATIONAL : rai.foxtel.com.au
SHOWTIME GREATS : showtimegreats.foxtel.com.au
SKY RACING : skyracing.foxtel.com.au
TCM : tcm.foxtel.com.au
THE BIOGRAPHY CHANNEL : biography.foxtel.com.au
THE COMEDY CHANNEL : comedy.foxtel.com.au
THE COMEDY CHANNEL+2 : comedy2.foxtel.com.au
THE HISTORY CHANNEL : history.foxtel.com.au
THE LIFESTYLE CHANNEL : lifestyle.foxtel.com.au
THE LIFESTYLE CHANNEL+2 : lifestyle2.foxtel.com.au
THE WEATHER CHANNEL : weather.foxtel.com.au
TV1+2 : tv12.foxtel.com.au
TVSN : tvsn.foxtel.com.au
UKTV+2 : uktv2.foxtel.com.au
VH1 : vh1.foxtel.com.au
W : w.foxtel.com.au
};
}
More information about the mythtv-users
mailing list