[mythtv-users] Bell Expressvu PPV Scraper script
Andrew Saunders
saunders at pagpaintball.com
Wed May 10 18:03:29 EDT 2006
For some reason my attachment didn't post the first time. A few
people emailed me about it, so I'm reposting it. The info on this is back here:
http://www.gossamer-threads.com/lists/mythtv/users/203199#203199
<?php
/*
Bell ExpressVu PPV Scraper for MythTV v0.19 - 0.19.1 svn
by Andrew Saunders (saunders at pagpaintball.com)
ChangeLog:
April 29, 2006 - v0.2 - minor fixes plus updates for the minor
changes to the guide.
April 24, 2006 - v0.1 - fully working version
Installation:
step 1 - this requires the CLI version of PHP5 (may work with
4), must include mysqli and cURL support.
step 2 - select which channels you want from labs.zap2it.com or
another source and run "mythfilldatabase".
step 3 - set config info (below) and select which PPV channels
you wish to update from below.
step 4 - run from either a web browser or commandline (use
browser if debugging).
step 5 - enjoy having ppv info in your guide.
ToDo:
- experiment with curl_multi to grab multiple pages at once.
=== NOTE : THIS IS A BETA RELEASE, IT WORKS FOR ME, YMMV! ===
*/
/*
------------------------------------------------------------------------
CONFIG
------------------------------------------------------------------------ */
// Debug verbosity: 0=no output, 1=minimal, 2=verbose,
// (3&4 debugging only!) 3=include runtime variables, 4=all runtime info
$debugLevel = 4;
// how many days to scrape (read note above first)
$daysToScrape = 2;
$timezone = 'AST'; // options are : PST MST CST EST AST NEWF
$mysqlName = 'root'; // mysql name
$mysqlPass = ''; // mysql password
$mysqlDB = 'mythconverg'; // mythtv db name
$mysqlAddr = 'localhost'; // db address
// sourceID to update (find this in mythweb > edit settings > channel info)
$sourceID = 3;

// Channel numbers to scrape. Uncomment the groups you want (for speed's sake
// please only select what you watch and if it's in season).
$channelsToScrape = array();
// french PPV
//array_push($channelsToScrape, 156, 157, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177);
// english PPV
array_push(
    $channelsToScrape,
    351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366,
    367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381
);
// soccer
//array_push($channelsToScrape, 403);
// nhl
//array_push($channelsToScrape, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438);
// nascar
//array_push($channelsToScrape, 440, 441, 442, 443, 444, 445, 446);
// nfl
//array_push($channelsToScrape, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468);
// hpi tv
//array_push($channelsToScrape, 475, 476, 477);
// cricket
//array_push($channelsToScrape, 703);
// poland
//array_push($channelsToScrape, 711);
// kids PPV
array_push($channelsToScrape, 560, 561);
// venus PPV
//array_push($channelsToScrape, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760);
// HD PPV
//array_push($channelsToScrape, 830, 831, 832, 833);
/*
------------------------------------------------------------------------
DO NOT EDIT BELOW THIS LINE UNLESS YOU KNOW WHAT YOU'RE DOING
------------------------------------------------------------------------ */
set_time_limit(60 * 60); // 1hr max runtime for script to finish

// setup mysql
// `new mysqli(...)` always evaluates to an object, so an `or die()` tail can
// never fire; connection failure has to be checked explicitly, and with the
// mysqli_* error functions rather than the old mysql_* ones.
$dbi = new mysqli($mysqlAddr, $mysqlName, $mysqlPass, $mysqlDB);
if (mysqli_connect_errno()) {
    die('Could not connect: ' . mysqli_connect_error());
}
$stmt = $dbi->stmt_init();

// setup cURL
$ch = curl_init();
// CURL_HTTP_VERSION_1_1 is a *value* for CURLOPT_HTTP_VERSION, not an option.
curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
curl_setopt($ch, CURLOPT_TIMEOUT, 30);
curl_setopt($ch, CURLOPT_FAILONERROR, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
// Only the header value goes here; cURL adds the "User-Agent:" name itself.
curl_setopt(
    $ch,
    CURLOPT_USERAGENT,
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.2) Gecko/20060308 Firefox/1.5.0.2'
);
// generate a new cookie every time (comment out when using a cookie file below)
curl_setopt($ch, CURLOPT_COOKIEJAR, '-');
// or uncomment below to save cookies between instances
//$cookieFile = '/dir/to/xvu_cookie.txt';
//curl_setopt($ch, CURLOPT_COOKIEJAR, $cookieFile);
//curl_setopt($ch, CURLOPT_COOKIEFILE, $cookieFile);
// uncomment below for extra debugging info
//curl_setopt($ch, CURLOPT_VERBOSE, true);
//curl_setopt($ch, CURLOPT_HEADER, true);

// maximum number of times it will retry to get an existing page
$curlMaxRetries = 10;
// maximum number of retries (across the whole run) before failing out
$curlMaxAbort = 20;
// true/false if running from the commandline
$commandLine = isset($argv);
/*
------------------------------------------------------------------------
RETRIEVE ALL THE PPV IDS
------------------------------------------------------------------------ */
// Walks the guide day by day in 2-hour pages, paging down through the
// channels, and collects the id of every PPV link found on a wanted channel
// into $ppvIDArray (id => array('day' => offset from today)).
$currentDay = 0;  // selectedDay sent to the guide: 0 = current day
$currentHour = 1; // selectedStartTime slot sent to the guide: 1 = 1am
$ppvIDArray = array();

// link the guide uses to open a programme's detail popup; group 1 is the id
$ppvLinkPattern = "/javascript:popupwin(?:PPV)?\(\'(\d+)\',/m";
// text of the guide's "service unavailable" page
$guideDownPattern = '/Sorry, due to technical difficulties this function is not available/';

// get initial cookie
$html = false;
$retries = 0;
while (($html === false) && (++$retries <= $curlMaxRetries)) {
    checkRetries();
    printDebugInfo('retreiving initial page for cookie :: attempt ' . $retries . ' / ' . $curlMaxRetries, 1);
    curl_setopt($ch, CURLOPT_URL, 'http://www.bell.ca/ExpressVuEPG/loadVuGuide.do?lang=en');
    $html = curl_exec($ch);
    printDebugInfo($html, 4);
    if (($html !== false) && preg_match($guideDownPattern, $html)) {
        // "Sorry, due to technical difficulties this function is not
        // available at this time. Please try again later." -> hit it again
        printDebugInfo('guide is down, try again later...', 1);
        $html = false;
    }
}
if ($html === false) {
    printDebugInfo('unable to retrieve initial page, aborting...', 1);
    exit();
}

// calculate the starting/ending channel
$startingChannel = 1000;
$endChannel = 0; // highest wanted channel; kept for a possible early-exit, not used to stop paging
foreach ($channelsToScrape as $channel) {
    if ($channel < $startingChannel) {
        $startingChannel = $channel;
    }
    if ($channel > $endChannel) {
        $endChannel = $channel;
    }
}

while ($currentDay < $daysToScrape) {
    $pageDown = false;
    do {
        $html = false;
        $retries = 0;
        while (($html === false) && (++$retries <= $curlMaxRetries)) {
            if (!$pageDown) {
                // human-readable window covered by this page (slot -> hour/minute)
                $slotMinute = (($currentHour - 1) % 2) * 30;
                $windowStart = date('H:i', mktime((int) (($currentHour + 1) / 2), $slotMinute, 0, 1, 1, 2000));
                $windowEnd = date('H:i', mktime((int) (($currentHour + 1 + 4) / 2), $slotMinute, 0, 1, 1, 2000));
                printDebugInfo(
                    'retrieving list of PPVs :: ' .
                    'day ' . ($currentDay + 1) . ' / ' . $daysToScrape .
                    ', hour ' . $windowStart . ' - ' . $windowEnd .
                    ', attempt ' . $retries . ' / ' . $curlMaxRetries,
                    1
                );
            } else {
                printDebugInfo('retrieving next page of channels :: attempt ' . $retries . ' / ' . $curlMaxRetries, 2);
            }
            checkRetries();
            if (!$pageDown) {
                // send POST, follow redirect (automatically) and get page
                // selectedStartTime :: 1=1am, 2=1:30am, 3=2:00am, ..., 46=11:30pm, 47=12:00am, 48=12:30am
                // selectedDay :: 0=current day, 1=next day, 2=two days later, ..., 13=...
                // gotoChannel :: starting channel
                curl_setopt($ch, CURLOPT_URL, 'http://www.bell.ca/ExpressVuEPG/submitSearchFilter.do');
                curl_setopt($ch, CURLOPT_REFERER, 'http://www.bell.ca/ExpressVuEPG/loadVuGuide.do?lang=en');
                curl_setopt($ch, CURLOPT_POST, true);
                curl_setopt(
                    $ch,
                    CURLOPT_POSTFIELDS,
                    'favID=&favName1=&favName2=&favName3=&favName4=&favKeyword1=&favKeyword2=&favKeyword3=&favKeyword4=&favNetwork1=&favNetwork2=&favNetwork3=&favNetwork4=&favTheme1=&favTheme2=&favTheme3=&favTheme4=&selectedTheme=1%2C0%2C0%2C0%2C0%2C0%2C0%2C0&progID=&progTZ=&selectedKeyword=&selectedNetwork=&cbAll=on&orderbyName=&orderbyNo=asc&searchWin=1&userTimeZone=0&selectedFavName=&selectedDay='
                    . $currentDay . '&selectedStartTime=' . $currentHour
                    . '&selectedTimeZone=' . $timezone . '&gotoChannel=' . $startingChannel
                    . '&buttonPressed.x=16&buttonPressed.y=8'
                );
            } else {
                // send command to move down a page
                curl_setopt($ch, CURLOPT_URL, 'http://www.bell.ca/ExpressVuEPG/submitChangeView.do?buttonPressed=DOWN');
                curl_setopt($ch, CURLOPT_REFERER, 'http://www.bell.ca/ExpressVuEPG/submitSearchFilter.do');
                curl_setopt($ch, CURLOPT_POST, false);
            }
            $html = curl_exec($ch);
            printDebugInfo($html, 4);
            if (($html !== false) && preg_match($guideDownPattern, $html)) {
                printDebugInfo('guide is down, trying again', 2);
                $html = false;
            }
        }
        if ($html === false) {
            // every attempt failed; nothing to parse, so move on to the next time window
            printDebugInfo('page could not be retrieved, skipping to the next time window', 2);
            break;
        }

        // remove newlines and junk
        // NOTE(review): the archived copy shows a bare space after the "|", which
        // would strip every space and break all patterns below; restored as the
        // &nbsp; entity it presumably was -- confirm against the original attachment.
        $html = preg_replace("/([\r\n]|&nbsp;)/m", "", $html);
        // trim to the guide rows
        $html = preg_replace("/^.*?start of dynamic rows(.*?)end of dynamic rows.*$/", "$1", $html);
        // tighten: drop whitespace around tag delimiters
        $html = preg_replace("/[ \t]*([<>])[ \t]*/", "$1", $html);

        // one chunk per table row
        foreach (explode('</tr>', $html) as $htmlPart) {
            // the channel number cell identifies which channel this row belongs to
            if (!preg_match('/<td bgcolor="#E5F2F8" width="4%" align="center">(\d+)<\/td>/', $htmlPart, $channelMatch)) {
                continue;
            }
            $channel = (int) $channelMatch[1];
            printDebugInfo('channel : ' . $channel, 3);
            // loose comparison on purpose: tolerates channels configured as numeric strings
            if (!in_array($channel, $channelsToScrape)) {
                continue;
            }
            // this is a channel we want, so rip out the ppv links
            $matches = array();
            preg_match_all($ppvLinkPattern, $htmlPart, $matches);
            printDebugInfo('ppv ids from page ($matches)', 3);
            printDebugInfo($matches, 3);
            foreach ($matches[1] as $match) {
                // isset() forces uniqueness and keeps the first day the movie was found on
                if (isset($ppvIDArray[$match])) {
                    continue;
                }
                // annoyingly the day has to be set manually as it isn't _anywhere_ on the info page...
                // NOTE(review): $currentHour steps 1, 5, ..., 45 below, so the ">= 47" branch
                // looks unreachable; shows starting after midnight on the 23:00 page would
                // then keep the earlier day -- confirm and record the slot if so.
                if ($currentHour >= 47) {
                    // if it's past midnight it's the next day
                    $ppvIDArray[$match] = array('day' => ($currentDay + 1));
                } else {
                    $ppvIDArray[$match] = array('day' => $currentDay);
                }
            }
        }
        $pageDown = true; // start heading down through the pages
    } while (preg_match($ppvLinkPattern, $html)); // while PPVs still listed

    // adjust time
    $currentHour += 4; // add 2 hours (2 hours listed per page)
    if ($currentHour > 48) {
        // past the maximum 48 so roll to the next day
        $currentHour = $currentHour % 48;
        $currentDay++;
    }
}
printDebugInfo('all ppv ids found ($ppvIDArray)', 3);
printDebugInfo($ppvIDArray, 3);
printDebugInfo('finished retrieving PPV list, ' . count($ppvIDArray) . ' PPVs found', 1);
/*
------------------------------------------------------------------------
RETRIEVE ALL INDIVIDUAL PPV INFORMATION
------------------------------------------------------------------------ */
// Fetches the detail page of every collected PPV id and replaces its
// $ppvIDArray entry with the parsed fields; entries that cannot be fetched
// or that are not on a wanted channel are dropped.
foreach ($ppvIDArray as $ppvID => $ppvInfo) {
    printDebugInfo('retrieving PPV ' . $ppvID, 2);
    curl_setopt($ch, CURLOPT_POST, false);
    curl_setopt($ch, CURLOPT_REFERER, 'http://www.bell.ca/ExpressVuEPG/submitSearchFilter.do');
    curl_setopt(
        $ch,
        CURLOPT_URL,
        'http://www.bell.ca/ExpressVuEPG/vuDetails.do?code=' . $ppvID . '&tzcode=' . $timezone
    );
    $html = false;
    $retries = 0;
    while (($html === false) && (++$retries <= $curlMaxRetries)) {
        checkRetries();
        $html = curl_exec($ch);
        printDebugInfo($html, 4);
    }
    if ($html === false) {
        // nothing to parse; drop the entry instead of storing empty fields
        printDebugInfo('unable to retrieve PPV ' . $ppvID . ', skipping', 2);
        unset($ppvIDArray[$ppvID]);
        continue;
    }

    // remove newlines and junk
    // NOTE(review): the archived copy shows a bare space after the "|", which
    // would strip every space and break all patterns below; restored as the
    // &nbsp; entity it presumably was -- confirm against the original attachment.
    $html = preg_replace("/([\r\n]|&nbsp;)/m", "", $html);
    // trim to the interesting part of the page
    $html = preg_replace("/^.*?<Body(.*?)Your Bell ExpressVu PIN.*$/", "$1", $html);
    // tighten: drop whitespace around tag delimiters
    $html = preg_replace("/[ \t]*([<>])[ \t]*/", "$1", $html);

    // rip out info; a field whose pattern does not match stays empty rather
    // than ending up holding the whole page
    $title = $channel = $startTime = $endTime = '';
    $description = $rating = $cost = '';
    $found = array();
    if (preg_match('/<td valign="top" align="left"><div class="bigblueBoldText">(.*?)<\/div>/', $html, $found)) {
        $title = $found[1];
    }
    if (preg_match('/>Channel: .*? - (\d+)/', $html, $found)) {
        $channel = (int) $found[1];
    }
    if (preg_match('/>Start Time: (\d?\d:\d\d .M) /', $html, $found)) {
        $startTime = $found[1];
    }
    if (preg_match('/>End Time: (\d?\d:\d\d .M) /', $html, $found)) {
        $endTime = $found[1];
    }
    if (preg_match('/>Description of the show:<\/div><div class="blueText">(.*?)<\/div>/', $html, $found)) {
        // remove useless date from description
        $description = preg_replace("/^\(\d{2}:\d{2}[ap]m[^\)]*?\)(.*)$/m", "$1", $found[1]);
    }
    if (preg_match('/>Rating:<\/div><div class="blueText">([^<]*?)<\/div>/', $html, $found)) {
        // clean up junk
        $rating = preg_replace("/([ ]+,)/", "", $found[1]);
    }
    if (preg_match('/<div class="blueText">[$](\d+\.\d\d)<\/div>/', $html, $found)) {
        $cost = $found[1];
    }

    // loose in_array on purpose: tolerates channels configured as numeric strings
    if (($channel !== '') && in_array($channel, $channelsToScrape)) {
        // if we want this channel then insert into array
        $ppvIDArray[$ppvID] = array(
            'day' => $ppvInfo['day'],
            'title' => html_entity_decode($title),
            'channel' => $channel,
            'starttime' => $startTime,
            'endtime' => $endTime,
            'description' => html_entity_decode($description),
            'rating' => html_entity_decode($rating),
            'cost' => $cost
        );
    } else {
        unset($ppvIDArray[$ppvID]);
    }
    printDebugInfo("$title :: $channel :: $startTime :: $endTime :: $description :: $rating :: $cost", 3);
}
printDebugInfo('all info to be inserted into myth ($ppvIDArray)', 3);
printDebugInfo($ppvIDArray, 3);
/*
------------------------------------------------------------------------
SAVE PPV INFO TO MYTHTV DATABASE
------------------------------------------------------------------------ */
// Looks up the MythTV chanid of every wanted channel, wipes the existing
// listings of those channels, then inserts one program/programgenres/
// programrating row per scraped PPV.
printDebugInfo('inserting ppv info into myth', 1);

// get chanid for each individual channel
printDebugInfo('retrieving channel info from myth db', 3);
$channelsInMyth = array();
$sql = 'SELECT chanid, channum FROM channel WHERE sourceid = ? AND channum = ?';
foreach ($channelsToScrape as $channel) {
    if (!$stmt->prepare($sql)) {
        die($dbi->error);
    }
    $stmt->bind_param('ii', $sourceID, $channel);
    $stmt->execute();
    $stmt->store_result(); // buffer everything
    if ($stmt->errno) {
        die($stmt->error);
    }
    $stmt->bind_result($chanid, $channum);
    // only trust the bound variables when a row was actually fetched
    if ($stmt->fetch() && $channum) {
        $channelsInMyth[$channum] = $chanid;
    }
    $stmt->free_result();
}

// delete all channel lineups
printDebugInfo('deleting all previous info from ppv channels', 3);
$sqls = array(
    'DELETE FROM program WHERE chanid = ?',
    'DELETE FROM programgenres WHERE chanid = ?',
    'DELETE FROM programrating WHERE chanid = ?'
);
foreach ($channelsInMyth as $chanid) {
    foreach ($sqls as $sql) {
        if (!$stmt->prepare($sql)) {
            die($dbi->error);
        }
        $stmt->bind_param('i', $chanid);
        $stmt->execute();
        if ($stmt->errno) {
            die($stmt->error);
        }
        $stmt->free_result();
    }
}

printDebugInfo('inserting individual ppv info', 3);
// insert into myth
foreach ($ppvIDArray as $ppvID => $ppvInfo) {
    if (!isset($channelsInMyth[$ppvInfo['channel']])) {
        // channel is not in the myth lineup for this source
        continue;
    }
    $chanid = $channelsInMyth[$ppvInfo['channel']];

    // the scraped times carry no date, so anchor them on today + the day offset
    $currentDate = date('Y-m-d', mktime(0, 0, 0, (int) date('n'), (int) date('j') + $ppvInfo['day'], (int) date('Y')));
    $startStamp = strtotime($currentDate . ' ' . $ppvInfo['starttime']);
    $endStamp = strtotime($currentDate . ' ' . $ppvInfo['endtime']);
    if (($ppvInfo['starttime'] == '') || ($ppvInfo['endtime'] == '')
        || ($startStamp === false) || ($endStamp === false)) {
        // never write a bogus date into the guide
        printDebugInfo('skipping PPV ' . $ppvID . ', start/end time could not be parsed', 1);
        continue;
    }
    if ($startStamp > $endStamp) {
        // the endtime lands on the next day
        $nextDate = date('Y-m-d', mktime(0, 0, 0, (int) date('n'), (int) date('j') + $ppvInfo['day'] + 1, (int) date('Y')));
        $endStamp = strtotime($nextDate . ' ' . $ppvInfo['endtime']);
    }
    $starttime = date('Y-m-d H:i:s', $startStamp);
    $endtime = date('Y-m-d H:i:s', $endStamp);

    $genre = 'PPV'; // acceptable?
    $closecaptioned = 0;
    $stars = 0;
    $stereo = 1;
    $title = $ppvInfo['title'];
    // rating text and cost are appended to the description shown in the guide
    $description = $ppvInfo['description'];
    if ($ppvInfo['rating']) {
        $description .= "\r\n" . 'Rating: ' . $ppvInfo['rating'];
    }
    if ($ppvInfo['cost']) {
        $description .= "\r\n" . 'Cost: $' . $ppvInfo['cost'];
    }

    // try and detect some ratings; first parenthesised code found wins, else NR
    $rating = 'NR';
    foreach (array('G', 'PG', 'R') as $ratingCode) {
        if (preg_match('/\(' . $ratingCode . '\)/', $ppvInfo['rating'])) {
            $rating = $ratingCode;
            break;
        }
    }

    // insert into sql; failures are reported but do not stop the run
    $sql = 'INSERT INTO program (starttime, endtime, chanid, category, closecaptioned, stars, stereo, title, description)'
        . ' VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)';
    if ($stmt->prepare($sql)) {
        $stmt->bind_param(
            'ssisidiss',
            $starttime,
            $endtime,
            $chanid,
            $genre,
            $closecaptioned,
            $stars,
            $stereo,
            $title,
            $description
        );
        $stmt->execute();
        if ($stmt->errno) {
            printDebugInfo('mysql : ' . $stmt->error, 1);
        }
        $stmt->free_result();
    } else {
        printDebugInfo('mysql : ' . $dbi->error, 1);
    }

    $sql = 'INSERT INTO programgenres (starttime, chanid, genre) VALUES (?, ?, ?)';
    if ($stmt->prepare($sql)) {
        $stmt->bind_param('sis', $starttime, $chanid, $genre);
        $stmt->execute();
        if ($stmt->errno) {
            printDebugInfo('mysql : ' . $stmt->error, 1);
        }
        $stmt->free_result();
    } else {
        printDebugInfo('mysql : ' . $dbi->error, 1);
    }

    $sql = 'INSERT INTO programrating (starttime, chanid, rating) VALUES (?, ?, ?)';
    if ($stmt->prepare($sql)) {
        $stmt->bind_param('sis', $starttime, $chanid, $rating);
        $stmt->execute();
        if ($stmt->errno) {
            printDebugInfo('mysql : ' . $stmt->error, 1);
        }
        $stmt->free_result();
    } else {
        printDebugInfo('mysql : ' . $dbi->error, 1);
    }
}
printDebugInfo('done...', 1);
curl_close($ch);
exit();
// done
/*
------------------------------------------------------------------------
FUNCTIONS
------------------------------------------------------------------------ */
/**
 * Output a debug message to either the web browser or the console.
 *
 * Nothing is printed when $textDebugLevel is higher than the global
 * $debugLevel. On the command line the text (or a print_r dump of an array)
 * is followed by a newline. In a browser the text is HTML-escaped, arrays are
 * wrapped in <pre>, and messages of level 3 and above are drawn in a
 * bordered box (blue for level 3, red above that).
 *
 * @param string/array $text           message to print, or array to dump
 * @param int          $textDebugLevel verbosity level this message belongs to
 */
function printDebugInfo($text, $textDebugLevel) {
    global $debugLevel, $commandLine;

    if ($textDebugLevel > $debugLevel) {
        return;
    }

    if ($commandLine) {
        // running from command line
        if (is_array($text)) {
            // print_r() must *return* the dump for the newline to be appended
            echo print_r($text, true) . "\n";
        } else {
            echo $text . "\n";
        }
        return;
    }

    // running from a browser
    $boxed = ($textDebugLevel >= 3);
    if ($boxed) {
        // the box is opened and closed on the same condition so the markup stays balanced
        $borderColor = ($textDebugLevel == 3) ? '#0000FF' : '#FF0000';
        echo '<div style="margin: 2px 0; padding-left:10px; background-color:#EEEEEE; border:1px solid '
            . $borderColor . ';">';
    }
    if (is_array($text)) {
        echo '<pre>';
        print_r($text);
        echo '</pre><br />';
    } else {
        // $text may be false when a page fetch failed; cast so it prints as empty
        echo nl2br(htmlentities((string) $text)) . '<br />';
    }
    if ($boxed) {
        echo '</div>';
    }
    flush(); // send to browser immediately
}
/**
 * Keep track of how many retries have been used; a function just to keep
 * the fetch loops clean.
 *
 * Reads the global $retries (attempt number of the current fetch loop). On
 * the first attempt it does nothing. On any later attempt it uses up one unit
 * of the global $curlMaxAbort budget -- ending the script once that budget is
 * exhausted -- and pauses before the caller retries.
 */
function checkRetries() {
    global $retries, $curlMaxAbort;

    if ($retries <= 1) {
        // first attempt of this page, nothing has failed yet
        return;
    }
    if ($curlMaxAbort == 0) {
        printDebugInfo('too many pages failed, aborting...', 1);
        exit();
    }
    $curlMaxAbort--;

    // a page failed so wait 3 seconds; the pause is what the message below
    // announces and keeps failed requests from being re-sent back to back
    printDebugInfo('page retrieval failed, waiting 3 seconds before retrying', 3);
    sleep(3);
}
?>
More information about the mythtv-users
mailing list