#!/usr/bin/perl
### -------------------------------------------------------------------------
### Program: abcvidz.pl
### Author : Phill Edwards
### Version: 2.0    Date : Jul 2006
###   Vsn 2.0 is a rewrite to a) make it more efficient by only
###   downloading 1 .asx file and b) to capture more program
###   streams from ABC.
###
### This is a MythStream parser or harvester (not sure if there's a diff).
### It gets ABC Australia broadband videos which typically have ~ 7-8
### stories, such as news, business wraps etc.
### It gets the info by getting the URL to the news stories from a file
### called selector.htm
###
### The Selector file contains vars which provide the destination URL of where
### the current content is. As at 1-Jul-06 these are examples:
###   var ref="http://www.abc.net.au/atthemovies/broadband/20060628_2200/";
###   var ref="http://www.abc.net.au/tv/australiawide/video/20060630_1552/";
###   var ref="http://www.abc.net.au/tv/btn/broadband/20060627/";
###   var ref="http://www.abc.net.au/business/video/20060701_0815/";
###   var ref="http://www.abc.net.au/tv/chaser/war/broadband/20060630_2200/";
###   var ref="http://www.abc.net.au/dig/tv/video/20060626_2100/";
###   var ref="http://www.abc.net.au/health/minutes/video/vod/";
###   var ref="http://www.abc.net.au/news/hemispheres/vod/20060622_1038/";
###   var ref="http://www.abc.net.au/insidebusiness/vod/20060625_1309/";
###   var ref="http://www.abc.net.au/insiders/vod/20060625_1153/";
###   var ref="http://www.abc.net.au/lateline/vod/20060701_0026/";
###   var ref="http://www.abc.net.au/broadbandmediawatch/20060626_2120/";
###   var ref="http://www.abc.net.au/newinventors/broadband/20060628_2000/";
###   var ref="http://www.abc.net.au/broadbandnews/20060701_1349/";
###   var ref="http://www.abc.net.au/news/nibs/20060701_1442/";
###   var ref="http://www.abc.net.au/broadbandrage/20060630_2300/";
###   var ref="http://www.abc.net.au/rural/videonews/20060630_1221/";
###   var ref="http://www.abc.net.au/7.30/vod/20060629_2230/";
###   var ref="http://www.abc.net.au/sport/video/20060701_1013/";
###   var ref="http://www.abc.net.au/weather/video/20060701_0824/";
###
### Once we know the above urls we can determine the url of the .asx file
### which contains the titles and MMS urls of the videos we're after. We can
### download this .asx file and read through it to extract titles and
### MMS urls which are output to MythStream in streams.res format.
###
###
### To use this MythStream parser/harvester add these lines to streams.res
### (without the ###'s!):
### [item]
### ABC Streams
### News
### http://127.0.0.1/dummy.html?prog=broadbandnews&vidqual=lq&output=0
### ABC Broadband Videos
### abcvidz
###
### [item]
### ABC Streams
### Sports
### http://127.0.0.1/dummy.html?prog=hemispheres&vidqual=hq&output=0
### ABC Broadband Videos
### abcvidz
###
### etc...
###
###
### Parameters:
### MythStream passes in the URL which is specified in streams.res
### so we get eg
### http://127.0.0.1/dummy.html?prog=news&vidqual=hq&output=0 as $ARGV[1]
###   prog = category of video from ABC -
###          eg broadbandnews, sport, weather, 7.30, rural,
###          broadbandrage, nibs, newinventors,
###          broadbandmediawatch, lateline, insiders, insidebusiness,
###          hemispheres, health, dig, chaser, business, btn,
###          australiawide, atthemovies
###
###   vidqual=quality of video - hq=high (broadband) and lq=low (dialup)
###
###   output=whether to write to file (1) or screen (0). Set to 0 when
###          using this as a parser in mythstream.
###
### To run or test this from the cmd line run this command:
###abcvidz.pl BLAH http://127.0.0.1/dummy.html?prog=btn\&vidqual=hq\&output=0
### -------------------------------------------------------------------------
use strict;
use warnings;
use LWP::Simple;    # Reqd for get http command.

###----------------------------------------------------------------------
### Set ABC domain and get name=value pairs from param passed in from
### streams.res URL line.
###----------------------------------------------------------------------
my $outfn  = "/var/www/html/abcvidz/streams.";   # Output filename prefix
#my $outfn = "./streams.";                       # Output filename for testing
my $domain = "http://www.abc.net.au/";           # ABC domain name
my %params = get_params($ARGV[1]);
my $prog    = $params{'prog'};     # broadbandnews,sport,business etc
my $vidqual = $params{'vidqual'};  # lq,hq=low,high bandwidth vids
my $output  = $params{'output'};   # 0=write to screen, 1=to file

### Without prog and vidqual we cannot build any URL, so stop early
### instead of failing later with "uninitialized value" warnings.
if (!defined($prog) || !defined($vidqual)) {
    print "abcvidz.pl: ERROR - prog and vidqual parameters are required!\n";
    exit 1;
}

###----------------------------------------------------------------------
### STEP 1 - Call subroutine to get URL to use for the chosen program.
###----------------------------------------------------------------------
my $desturl = get_url();
if (!defined($desturl)) {
    print "abcvidz.pl: ERROR - no URL found for program $prog!\n";
    exit 1;
}

###----------------------------------------------------------------------
### STEP 2 - Download the single .ASX file for the program and pick out
### each story's headline and MMS video URL. The file lives under the
### "meta" dir of the destination URL, eg:
###
###   http://www.abc.net.au/broadbandnews/20060408_1327/meta/hq1.asx
###
### where everything up to and including ".../20060408_1327/" is the
### desturl retrieved in STEP 1 and "hq" is the video quality.
###----------------------------------------------------------------------
my @stories;    # List of [headline, MMS URL] pairs, in file order

### Download hq1.asx/lq1.asx - this contains all the headlines and MMS urls.
my $asxfile = get($desturl . "meta/" . $vidqual . "1.asx");
if (!defined($asxfile)) {
    print "abcvidz.pl: ERROR - no asxfile!\n";
    exit 1;
}

### The file comes down as one big long line, so break it into one chunk
### per story. NOTE(review): the tag names in the split/match patterns had
### been lost from this file; <entry> and <title> are reconstructed from
### the surviving </title> pattern and the comments - confirm against a
### live .asx file.
foreach my $entry (split(/<entry>/i, $asxfile)) {
    ### Only chunks carrying a title are stories (the text before the
    ### first <entry> is the ASX preamble).
    my ($hdline) = $entry =~ m{<title>(.*?)</title>}is;
    next unless defined($hdline);

    ### Get rid of any strange control characters
    $hdline =~ s/[[:cntrl:]]//g;

    ### The MMS URL is the href of the story's <ref> element.
    my ($mmsurl) = $entry =~ m{<ref href="(mms[^"]*)"}i;
    next unless defined($mmsurl);

    push @stories, [ $hdline, $mmsurl ];
}

###----------------------------------------------------------------------
### STEP 3 - Print the headline and MMS filename out to a streams.res
### XML format file.
###----------------------------------------------------------------------
### Print streams.res format XML output for MythStream. If outputting to a
### file we need to redirect STDOUT to the required filename.
my $urldt = $desturl;    # Date/time part of Dest URL
$urldt =~ s/[[:alpha:][:punct:]]//g;    # Leaves only the digits
$outfn .= "$prog.$urldt";
if ($output) {
    open(STDOUT, '>', $outfn)
        or die "Can't open file $outfn for writing: $!";
}

### Loop through the stories to extract the data to print.
print "<items>\n";
foreach my $story (@stories) {
    my ($name, $url) = @{$story};

    ### Output in mythstream streams.res format
    print "<item>\n";
    print " <name>$name</name>\n";
    print " <url>$url</url>\n";
    print " <descr></descr>\n";
    print " <handler></handler>\n";
    print "</item>\n";
}
print "</items>\n";

### If we had redirected STDOUT to a file, close it now.
if ($output) {
    ### Buffered write errors only surface at close, so check it.
    close(STDOUT) or die "Can't close output file: $!";
}

exit;    # Exit program

###----------------------------------------------------------------------
### Return the URL to use for the selected program.
### This lives in a file called selector.htm from abc.net.au.
###
### Reads the file-level $prog. Returns the destination URL, eg
### http://www.abc.net.au/broadbandnews/20050810_2019/, or nothing
### (undef in scalar context) when selector.htm cannot be downloaded or
### has no "var ref" line for $prog.
###----------------------------------------------------------------------
sub get_url {
    my $urlfile = get("http://www.abc.net.au/vod/selector.htm");
    return unless defined($urlfile);

    ### Look at the file line by line for the first "var ref" line whose
    ### URL has $prog as a path component. \Q..\E makes $prog match
    ### literally, so the "." in eg 7.30 is not treated as a wildcard.
    foreach my $urlline (split(/\n/, $urlfile)) {
        next unless $urlline =~ /^var ref.*\/\Q$prog\E\//;

        my $url = $urlline;
        $url =~ s/^var ref=//;
        $url =~ s/[";]//g;

        ### For some reason ctrl char ^M is at end of the var so get rid of it
        $url =~ s/[[:cntrl:]]//g;
        return $url;
    }

    return;
}

### ----------------------------------------------------------------------
### Get the cmd line param passed to it and parse the name=value pairs
### into a hash which is returned to the calling routine.
### Should split eg prog=news&vidqual=hq&output=1 into a hash table with
### keys of "prog", "vidqual", "output" and corresponding values of
### "news", "hq", "1".
###
### Param  : the URL from streams.res, eg
###          http://127.0.0.1/dummy.html?prog=news&vidqual=hq&output=0
### Returns: hash of name => value; empty when no parameter was given.
### ----------------------------------------------------------------------
sub get_params {
    my ($param1) = @_;
    my %parr;

    return %parr unless defined($param1);

    ### Remove the http part up to the GET string from parameter
    $param1 =~ s/^http.*\?//;

    foreach my $pair (split(/\&/, $param1)) {
        ### Limit of 2 keeps any further "=" as part of the value.
        my ($varname, $varval) = split(/=/, $pair, 2);
        next unless defined($varname) && length($varname);
        $parr{$varname} = $varval;
    }

    return %parr;
}