[mythtv] [PATCH] MythVideo allocine grabber
Xavier Hervy
maxpower44 at tiscali.fr
Thu Oct 7 09:30:38 UTC 2004
The allocine web site has changed (it is used by mythvideo/scripts/allocine.pl).
I am sending the patch for allocine.pl and the new allocine.pl script.
Thanks for applying it.
Xavier
-------------- next part --------------
A non-text attachment was scrubbed...
Name: allocine.pl
Type: application/x-tv-program-info
Size: 13114 bytes
Desc: not available
Url : http://mythtv.org/pipermail/mythtv-dev/attachments/20041007/d5176435/allocine-0001.bin
-------------- next part --------------
--- /mnt/oldlocal/src/myth-0.14-06-03-2004/mythvideo/mythvideo/scripts/allocine.pl 2004-03-02 18:57:54.000000000 +0100
+++ allocine.pl 2004-10-07 11:31:26.788230680 +0200
@@ -13,26 +13,28 @@
use LWP::Simple; # libwww-perl providing simple HTML get actions
use HTML::Entities;
use URI::Escape;
-#use utf8;
-use vars qw($opt_h $opt_r $opt_d $opt_i $opt_v $opt_D $opt_M $opt_P);
-use Getopt::Std;
+
+use vars qw($opt_h $opt_r $opt_d $opt_i $opt_v $opt_D $opt_M $opt_P $opt_u $opt_originaltitle $opt_casting);
+use Getopt::Long;
$title = "Allocine Query";
-$version = "v1.00";
+$version = "v2.00";
$author = "Xavier Hervy";
# display usage
sub usage {
- print "usage: $0 -hdrviMPD [parameters]\n";
- print " -h help\n";
- print " -d debug\n";
- print " -r dump raw query result data only\n";
- print " -v display version\n";
- print " -i display info\n";
+ print "usage: $0 -hviuocMPD [parameters]\n";
+ print " -h, --help help\n";
+ print " -v, --version display version\n";
+ print " -i, --info display info\n";
+ print " -u, --utf8 output in utf8\n";
+ print " -o, --originaltitle concatenate title and original title\n";
+ print " -c, --casting with -D option, grap the complete actor list (much slower)\n";
print "\n";
- print " -M <query> get movie list\n";
- print " -D <movieid> get movie data\n";
+ print " -M <query>, --movie query> get movie list\n";
+ print " -D <movieid>, --data <movieid> get movie data\n";
+ print " -P <movieid>, --poster <movieid> get movie poster\n";
exit(-1);
}
@@ -61,11 +63,7 @@
my $start = index($ldata, lc($beg)) + length($beg);
my $finish = index($ldata, lc($end), $start);
- #my $ldata = $data;
- #my $start = index($ldata, $beg) + length($beg);
- #my $finish = index($ldata, $end, $start);
-
- #print "$start $finish\n";
+
if ($start != (length($beg) -1) && $finish != -1) {
my $result = substr($data, $start, $finish - $start);
# dont use decode entities &npsp; => spécial characters bug in html::entities ?
@@ -115,18 +113,23 @@
my $request = "http://www.allocine.fr/film/fichefilm_gen_cfilm=" . $movieid . ".html";
if (defined $opt_d) { printf("# request: '%s'\n", $request); }
my $response = get $request;
- if (defined $opt_r) { printf("%s", $response); }
# parse title and year
- my $title = parseBetween($response, "<FONT Class=\"titrePage\">", "</FONT>");
+ my $title = parseBetween($response, "<title>", "</title>");
+ my $original_title = parseBetween($response, "<h4>Titre original : ","</h4><br />");
+ $original_title = removeTag($original_title);
+ if (defined $opt_originaltitle){
+ if ($original_title ne ""){
+ $title = $title . " (" . $original_title . ")";
+ }
+ }
+
#print "titre = $title\n";
$title = removeTag($title);
- #print "titre ap = $title\n";
- my $year = parseBetween($response,"<TD><font >","</font>.");
- $year = parseBetween($year,"<font >(",")");
+ my $year = parseBetween($response,"<h4>Année de production : ","</h4>");
# parse director
- my $director = parseBetween($response," par </FONT>","</TD></TR>");
+ my $director = parseBetween($response,"<h4>Réalisé par ","</h4>");
$director = removeTag($director);
# parse writer
@@ -135,16 +138,14 @@
$writer = parseBetween($writer, "/\">", "</");
# parse plot
- my $plot = parseBetween($response,"<DIV Align='Justify'><FONT class=\"size2\">","</FONT></DIV>");
+ my $plot = parseBetween($response,"<td valign=\"top\" style=\"padding:10 0 0 0\"><div align=\"justify\"><h4>","</h4></div></td>");
#$plot.replace("<BR>","\n");
$plot = removeTag($plot);
# parse user rating
- #my $userrating = parseBetween($response, ">User Rating:</b>", "> (");
- #$userrating = parseBetween($userrating, "<b>", "/");
my $userrating;
- my $rating = parseBetween($response,"Presse","</TABLE>");
+ my $rating = parseBetween($response,"Presse</a> <img ", " border=\"0\" /></h4></td>");
my $nbvote = 0;
my $sommevote = 0;
$rating = parseBetween($rating,"etoile_",".gif");
@@ -152,12 +153,12 @@
$sommevote += $rating - 1;
$nbvote ++;
}
- $rating = parseBetween($response,"Spectateurs","</TABLE>");
+ $rating = parseBetween($response,"Spectateurs</a> <img ", " border=\"0\" /></h4></td>");
$rating = parseBetween($rating,"etoile_",".gif");
if (!($rating eq "")){
$sommevote += $rating - 1;
$nbvote ++;
- }
+ }
if ($nbvote==0){$userrating=0};
if ($nbvote==1){$userrating=$sommevote*2;};
if ($nbvote==2){$userrating=$sommevote;};
@@ -175,7 +176,7 @@
# parse movie length
- my $runtime = parseBetween($response,">Dur","</FONT>");
+ my $runtime = parseBetween($response,"Durée : ",".</h4> ");
my $heure;
my $minutes;
($heure,$minutes)=($runtime=~/[^\d]*(\d+)[^\d]*(\d*)/);
@@ -191,25 +192,53 @@
# parse cast
- my $cast = parseBetween($response, "<FONT Class=\"titreDescription\">Avec ", "</TD></TR>");
-
-# $cast = parseBetween($response, "<FONT Class=\"titreDescription\">Avec ", "Plus");
+ my $cast = parseBetween($response, "<h4>Avec ","</h4><br />");
$cast = removeTag($cast);
- if (parseBetween($cast,"", "Plus") eq ""){
- }else{
- $cast = parseBetween($cast,"", "Plus");
+ if (defined $opt_casting){
+ my $responsecasting = get "http://www.allocine.fr/film/casting_gen_cfilm=" . $movieid . ".html";
+ my $fullcast = parseBetween($responsecasting, "Acteur(s)", "<table");
+ my $oneactor;
+ $fullcast = parseBetween($fullcast,"style=\"background-color", "</table>");
+ my @listactor = split("style=\"background-color", $fullcast);
+ my @cleanlistactor ;
+ for $oneactor (@listactor ) {
+ $oneactor = parseBetween($oneactor,"<h4>","</h4>");
+ $oneactor = removeTag($oneactor );
+ push(@cleanlistactor,$oneactor);
+ }
+ my $finalcast = join (", ", @cleanlistactor);
+ if ($finalcast ne "") {$cast = $finalcast;};
}
+
+
#genres
- my $genres = parseBetween($response,")</font>. ","</FONT>.");#. <FONT Class=\"size2\">Dur");
+ my $genres = parseBetween($response,"<h4>Genre : ","</h4>");
$genres = removeTag($genres);
#countries
- my $countries = parseBetween($response,"</font> "," <font >(");
+ my $countries = parseBetween($response,"<h4>Film ",".</h4> ");
$countries = removeTag($countries);
-
+
+ if (defined $opt_u) {
+ utf8::encode($title);
+ utf8::encode($year);
+ utf8::encode($director);
+ utf8::encode($plot);
+ utf8::encode($userrating);
+ utf8::encode($movierating);
+ utf8::encode($runtime);
+ utf8::encode($writer);
+ utf8::encode($cast);
+ utf8::encode($genres);
+ utf8::encode($countries);
+ }
+
# output fields (these field names must match what MythVideo is looking for)
print "Title:$title\n";
+ if (!(defined $opt_originaltitle)){
+ print "OriginalTitle:$original_title\n";
+ }
print "Year:$year\n";
print "Director:$director\n";
print "Plot:$plot\n";
@@ -228,25 +257,24 @@
if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid);}
# get the search results page
- my $request = "http://www.allocine.fr/film/galerie_gen_cfilm=" . $movieid . "&big=1&page=1.html";
+
+ my $request = "http://www.allocine.fr/film/galerie_gen_cfilm=" . $movieid . ".html";
if (defined $opt_d) { printf("# request: '%s'\n", $request); }
my $response = get $request;
- if (defined $opt_r) { printf("%s", $response); }
+ my $page=parseBetween($response,"page=",".html\" class=\"link1\"><span class=\"text2\">Dernière photo");
+ my @pages = split ("page=",$page);
+ $request = "";
+ for $page (@pages ) {
+ $request = $page;
+ }
+ if (!($request eq "")) {
+ $request = "http://www.allocine.fr/film/galerie_gen_cfilm=" . $movieid . "&page=" . $request . ".html";
+ $response = get $request;
+ }
- my $uri = parseBetween($response, "Ko'></A><BR><CENTER>", "<BR></CENTER></TD></TR></TABLE>");
- if ($uri eq ""){
- $uri = parseBetween($response, "<A HREF=\"/film/galerie_gen_cfilm=" . $movieid . ".html\" target=\"\"><IMG border=0 src=\"", "\" Zalt");
- if ($uri eq "") {
- my $request = "http://www.allocine.fr/film/fichefilm_gen_cfilm=" . $movieid . ".html";
- if (defined $opt_d) { printf("# request: '%s'\n", $request); }
- my $response = get $request;
- if (defined $opt_r) { printf("%s", $response); }
- $uri = parseBetween($response, "<TABLE Border=0 CellPadding=0 CellSpacing=0><TR><TD><IMG Src='", "' Border=0></TD><TD><IMG Border=0 Src='");
-
- }
- }
-
-
+
+ my $uri = parseBetween($response,"<table style=\"padding:0 0 0 0\" border=\"0\" >","Ko\" />");
+ $uri = parseBetween($uri ,"<img src=\"","\" border=\"0\" class=\"galerie\" ");
print "$uri\n";
}
@@ -289,24 +317,18 @@
while (($typerecherche <=5) && ($count ==0)){
# get the search results page
- my $request = "http://www.allocine.fr/recherche/rubrique.html?typerecherche=$typerecherche&motcle=$query";
+ my $request = "http://www.allocine.fr/recherche/?rub=1&motcle=$query";
if (defined $opt_d) { printf("# request: '%s'\n", $request); }
my $response = get $request;
- if (defined $opt_r) {
- print $response;
- exit(0);
- }
# extract possible matches
# possible matches are grouped in several catagories:
# exact, partial, and approximate
- my $exact_matches = parseBetween($response, "<TABLE width=96% cellPadding=0 cellSpacing=2 border=0>",
- "</TABLE>");
- #print "$exact_matches\n";
+ my $exact_matches = $response;
# parse movie list from matches
- my $beg = "<A HREF=\"/film/fichefilm_gen_cfilm=";
- my $end = "</TD></TR>";
+ my $beg = "<h4><a href=\"/film/fichefilm_gen_cfilm=";
+ my $end = "</a></h4>";
my @movies;
@@ -323,9 +345,8 @@
while ($start != -1) {
$start += length($beg);
my $sub = substr($data, $start, $finish - $start);
- my ($movienum, $moviename) = split(".html\">", $sub);
- $title = parseBetween($moviename,"<FONT color=#003399>","</FONT></A>");
- $title = removeTag($title);
+ my ($movienum, $moviename) = split(".html\" class=\"link1\">", $sub);
+ $title = removeTag($moviename);
$moviename = removeTag($moviename);
my ($movieyear)= $moviename =~/\((\d+)\)/;
if ($movieyear){$title = $title." (".$movieyear.")"; }
@@ -337,11 +358,18 @@
$finish = index($data, $end, $start + 1);
# add to array
+# my $movienameutf8 =
+ utf8::encode($moviename);
$movies[$count++] = $movienum . ":" . $moviename;
}
# display array of values
- for $movie (@movies) { print "$movie\n"; }
+ for $movie (@movies) {
+ if (defined $opt_u) {
+ utf8::encode($movie);
+ }
+ print "$movie\n";
+ }
}
}
}
@@ -351,7 +379,17 @@
#
# parse command line arguments
-getopts('ohrdivDMP');
+
+ GetOptions( "utf8" => \$opt_u, # --utf8
+ "version" => \$opt_v,
+ "info" => \$opt_i,
+ "originaltitle" => \$opt_originaltitle,
+ "casting" => \$opt_casting,
+ "Data" => \$opt_D,
+ "Movie" => \$opt_M,
+ "Poster" => \$opt_P
+ );
+
# print out info
if (defined $opt_v) { version(); exit 1; }
More information about the mythtv-dev
mailing list