[mythtv-commits] Ticket #9074: allocine script for new metadata schema and allocine API

MythTV mythtv at cvs.mythtv.org
Fri Oct 8 15:38:57 UTC 2010


#9074: allocine script for new metadata schema and allocine API
--------------------------------------------------------+-------------------
 Reporter:  Alexandra Lepercq <alexandra@…>             |            Type:  enhancement     
   Status:  new                                         |        Priority:  minor           
Milestone:  unknown                                     |       Component:  MythTV - General
  Version:  Unspecified                                 |        Severity:  medium          
 Keywords:                                              |   Ticket locked:  0               
--------------------------------------------------------+-------------------
 Allocine has released an API for the metadata (thanks to
 http://wiki.gromez.fr/dev/api/allocine):
 http://api.allocine.fr/xml/movie?code=$movieid&partner=3

 I have made some modifications based on the allocine.pl script from Xavier
 Hervy to be consistent with the allocine API and the mythtv 0.24 metadata
 schema.

 I hope this helps



 #!/usr/bin/perl -w

 #
 # This perl script is intended to perform movie data lookups in French,
 # based on the www.allocine.fr website.
 #
 # For more information on MythVideo's external movie lookup mechanism, see
 # the README file in this directory.
 #
 # Original author: Xavier Hervy (maxpower44 AT tiscali DOT fr)

 # changes:
 #   20-10-2009: Geoffroy Geerseau ( http://www.soslinux.net : jamdess AT
 #   soslinux DOT net )
 #   Modified for the new allocine templates
 #   25-10-2009: Geoffroy Geerseau ( http://www.soslinux.net : jamdess AT
 #   soslinux DOT net )
 #   Poster download correction
 #   Userrating correction
 #   28-10-2009: Robert McNamara (Myth Dev)
 #   Fix issues in above patches-- files should never be downloaded to
 #   /tmp.
 #   Convert script to output in new grabber output format for .23.  Leave
 #   backwards compat.
 #   02-11-2009: Geoffroy Geerseau
 #   Allocine have, once again, changed their templates...
 #   06-08-2010: Alexandra Lepercq
 #   Allocine have, once again, changed their templates...
 #   Add some data from api.allocine.fr (thanks to
 #   http://wiki.gromez.fr/dev/api/allocine)
 #       http://api.allocine.fr/xml/movie?code=$movieid&partner=3

 use File::Basename;
 use File::Copy;
 use lib dirname($0);
 use Encode;
 use utf8;
 use Encode 'from_to';
 use MythTV::MythVideoCommon;

 use vars qw($opt_h $opt_r $opt_d $opt_i $opt_v $opt_D $opt_l $opt_M $opt_P
 $opt_originaltitle $opt_casting $opt_u_dummy);
 use Getopt::Long;

 # Script identity; version() prints these three values.
 ($title, $version, $author) = ("Allocine Query", "v2.06", "Xavier Hervy");

 # Append the script title and version to MythVideoCommon's URL_get_extras
 # list (presumably used to tag outgoing HTTP requests -- confirm in the
 # module).
 push @MythTV::MythVideoCommon::URL_get_extras, $title, $version;

 # Everything printed on STDOUT goes through the :utf8 layer.
 binmode STDOUT, ":utf8";

 # display usage
 #
 # Prints the command-line synopsis on STDOUT, one option per line, and then
 # terminates the script.  Never returns: the process exits with status -1.
 sub usage {
    print "usage: $0 -hviocMPD [parameters]\n";
    print "       -h, --help                       help\n";
    print "       -v, --version                    display version\n";
    print "       -i, --info                       display info\n";
    # Each description is kept on a single physical line so that no stray
    # line break ends up inside the printed text.
    print "       -o, --originaltitle              concatenate title and original title\n";
    print "       -c, --casting                    with -D option, grab the complete actor list (much slower)\n";
    print "\n";
    print "       -M <query>,   --movie <query>    get movie list\n";
    print "       -D <movieid>, --data <movieid>   get movie data\n";
    print "       -P <movieid>, --poster <movieid> get movie poster\n";
    exit(-1);
 }

 # Print one line naming the program: its title, its version in parentheses,
 # and its author, taken from the script-level $title/$version/$author.
 sub version {
    printf "%s (%s) by %s\n", $title, $version, $author;
 }

 # Print one line describing the kind of lookup this script performs.
 sub info {
    my $summary = "Performs queries using the www.allocine.fr website.";
    print $summary . "\n";
 }

 # Show the full help: the version line, the info line, then the usage text.
 # The last step, usage(), exits the script, so this sub does not return.
 sub help {
    foreach my $step (\&version, \&info, \&usage) {
       $step->();
    }
 }

 # returns text within 'data' without tag
 #
 # Every span that runs from a "<" up to and including the next ">" is
 # deleted; text outside such spans is returned unchanged.  A "<" that is
 # never followed by a ">" is left in place.
 #
 # Parameters: $data - the text to clean.
 # Returns:    the text with all tag spans removed ($data itself, unchanged,
 #             when it is undefined).
 sub removeTag {
    my ($data) = @_; # grab parameters

    return $data unless defined $data;

    # One global substitution replaces the former index()/substr() loop.
    # That loop could not detect a missing ">" (index()+1 is never -1), so
    # an unterminated "<" made it prepend text to itself forever.
    $data =~ s/<[^>]*>//g;

    return $data;
 }


 # get Movie Data
 #
 # Looks up one movie through the Allocine XML API and prints a <metadata>
 # XML document describing it on STDOUT.  Title, year, synopsis, rating,
 # runtime, cast, genres and countries come from the API response; the poster
 # URL is scraped from the www.allocine.fr poster pages.
 #
 # Parameters: $movieid - Allocine movie code, used both in the API "code="
 #                        parameter and in the site URLs.
 # Returns:    nothing meaningful; the result is the XML printed on STDOUT.
 sub getMovieData {
    my ($movieid) = @_; # grab movieid parameter
    if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid); }

    # get Movie MetaData from api.allocine
    my $requestAPI = "http://api.allocine.fr/xml/movie?code=$movieid&partner=3";
    if (defined $opt_d) { printf("# request: '%s'\n", $requestAPI); }

    # myth_url_get is used as a (status, body) pair everywhere else in this
    # script, so take the body from the second slot here as well instead of
    # relying on scalar-context behaviour.
    my (undef, $responseAPI) = myth_url_get($requestAPI);

    # Decode to Perl characters rather than squeezing the text into
    # iso-8859-1: characters outside Latin-1 (e.g. "oe" ligatures) survive,
    # and STDOUT's :utf8 layer re-encodes them on output.
    # NOTE(review): assumes myth_url_get returns raw UTF-8 bytes, as the
    # previous from_to() call did -- confirm against MythVideoCommon.
    $responseAPI = defined $responseAPI ? decode('utf-8', $responseAPI) : "";

    # Public page of the movie; only reported as <homepage>, not downloaded.
    my $allocineurl = "http://www.allocine.fr/film/fichefilm_gen_cfilm=" . $movieid . ".html";

    # parse Title and Year
    my $titleApi = parseBetween($responseAPI, "<title>", "</title>");
    my $originaltitleApi = parseBetween($responseAPI, "<originalTitle>", "</originalTitle>");
    my $yearApi = parseBetween($responseAPI, "<productionYear>", "</productionYear>");

    # --originaltitle: append the original title, unless there is none or it
    # is the same as the title.
    if (defined $opt_originaltitle
        && $originaltitleApi ne ""
        && $originaltitleApi ne $titleApi) {
       $titleApi = $titleApi . " (" . $originaltitleApi . ")";
    }

    # parse Plot
    my $plotApi = parseBetween($responseAPI, "<synopsis>", "</synopsis>");

    # parse User Rating: the API value is multiplied by 2.5 and truncated.
    # A missing or non-numeric value yields 0 without a numeric warning.
    my $userratingOrig = parseBetween($responseAPI, "<userRating>", "</userRating>");
    my $userratingApi = 0;
    if ($userratingOrig =~ /^\s*\d+(?:\.\d+)?\s*$/) {
       $userratingApi = int($userratingOrig * 2.5);
    }

    # parse Movie length: the API value is divided by 60 (presumably seconds
    # to minutes -- confirm against the API) and rounded to a whole number.
    my $runtimeOrig = parseBetween($responseAPI, "<runtime>", "</runtime>");
    my $runtimeApi = 0;
    if ($runtimeOrig =~ /^\s*\d+\s*$/) {
       $runtimeApi = int($runtimeOrig / 60 + 0.5);
    }

    # parse Cast
    # The substitutions below rewrite the API's <castMember> markup in place
    # into <person name=".." job=".." character=".."/> elements.  They are
    # order-dependent: the digit removal blanks the numeric code="..." values
    # so that the following '">' removal fuses attribute name and text.
    # NOTE(review): the digit removal also strips digits from names, roles
    # and picture URLs.
    my $castApi = parseBetween($responseAPI, "<casting>", "</casting>");
    $castApi =~ s!<castMember>!\n!g;
    $castApi =~ s!</castMember>!/>!g;
    $castApi =~ s/person code/person name/g;
    $castApi =~ s!</person>!"!g;
    $castApi =~ s/<activity code/ job/g;
    $castApi =~ s!</activity>!"!g;
    $castApi =~ s/<role>/ character="/g;
    $castApi =~ s!</role>!"!g;
    $castApi =~ s![0-9]!!g;
    $castApi =~ s!">!!g;
    $castApi =~ s/<picture href/ picture/g;
    $castApi =~ s!</picture>!"!g;
    $castApi =~ s!picture="http://images.allocine.fr/medias/nmedia/////.jpg"!!g;
    $castApi =~ s!Réalisateur!director!g;
    $castApi =~ s!Acteur!actor!g;
    $castApi =~ s!Producteur!producer!g;
    $castApi =~ s!Compositeur!composer!g;

    #Genres: same rewriting technique, producing <category name=".."/> lines.
    my $genreApi = parseBetween($responseAPI, "<genreList>", "</genreList>");
    $genreApi =~ s/genre code/category name/g;
    $genreApi =~ s!</genre>!"/>\n!g;
    $genreApi =~ s![0-9]!!g;
    $genreApi =~ s!">!!g;

    #Countries: same rewriting technique, producing <country name=".."/> lines.
    my $countryApi = parseBetween($responseAPI, "<nationalityList>", "</nationalityList>");
    $countryApi =~ s/nationality code/country name/g;
    $countryApi =~ s!</nationality>!"/>\n!g;
    $countryApi =~ s![0-9]!!g;
    $countryApi =~ s!">!!g;

    # parse for Coverart
    # Step 1: the poster listing page gives the id of the first poster.
    my $request = "http://www.allocine.fr/film/fichefilm-".$movieid."/affiches/";
    if (defined $opt_d) { printf("# request: '%s'\n", $request); }
    my ($rc, $response) = myth_url_get($request);
    my $mediafile = parseBetween($response, "<a href=\"/film/fichefilm-".$movieid."/affiches/detail/?cmediafile=", "\" >");

    # Step 2: the poster detail page links to the full-size image.
    my $request2 = "http://www.allocine.fr/film/fichefilm-".$movieid."/affiches/detail/?cmediafile=".$mediafile;
    my $response2;
    ($rc, $response2) = myth_url_get($request2);
    my $uri = trim(parseBetween($response2, "<a Target=\"_blank\" Class=\"fs11\" href=\"", "\">Agrandir</a>"));

    # Step 3: fall back to the thumbnail on the listing page.  The page
    # downloaded in step 1 is reused rather than fetched a second time.
    if ($uri eq "")
    {
         my $tmp_uri = parseBetween($response, "<a href=\"/film/fichefilm-".$movieid."/affiches/\">", " alt=");
         $tmp_uri =~ s/\n/ /gm;
         # The leading "h" is part of the search marker, so it is put back.
         $uri = trim(parseBetween($tmp_uri, "<img src='h", "'"));
         if ($uri ne "")
         {
                 $uri = "h$uri";
         }
    }
    # if no picture was found, just download the empty poster
    if ($uri eq "") {
         $uri = "http://images.allocine.fr/r_160_214/commons/emptymedia/AffichetteAllocine.gif";
    }

    # MetaData output
    print "<?xml version='1.0' encoding='UTF-8'?>\n";
    print "<metadata>\n";
    print "<item>\n";
    print "<inetref>$movieid</inetref>\n";
    print "<title>$titleApi</title>\n";
    print "<language>fr</language>\n";
    print "<description>$plotApi</description>\n";
    print "<countries>\n";
    print "$countryApi";
    print "</countries>\n";
    print "<categories>\n";
    print "$genreApi";
    print "</categories>\n";
    print "<userrating>$userratingApi</userrating>\n";
    print "<year>$yearApi</year>\n";
    print "<runtime>$runtimeApi</runtime>\n";
    print "<homepage>$allocineurl</homepage>\n";
    # No newline after <people>: each rewritten cast entry starts with one.
    print "<people>";
    print "$castApi";
    print "</people>\n";
    print "<images>\n";
    print "<image type=\"coverart\" url=\"$uri\"/>\n";
    print "</images>\n";
    print "</item>\n";
    print "</metadata>\n";
 }

 # dump Movie Poster
 #
 # Prints exactly one line on STDOUT: the URL of a poster for the movie.
 # The poster detail page is tried first, then the thumbnail on the poster
 # listing page, and finally Allocine's generic "empty" poster.
 #
 # Parameters: $movieid - Allocine movie code used in the site URLs.
 # Returns:    nothing meaningful; the result is the printed URL.
 sub getMoviePoster {
    my ($movieid) = @_; # grab movieid parameter
    if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid); }

    # get the poster listing page; it names the first poster of the movie
    my $listurl = "http://www.allocine.fr/film/fichefilm-".$movieid."/affiches/";
    if (defined $opt_d) { printf("# request: '%s'\n", $listurl); }
    my ($rc, $listing) = myth_url_get($listurl);
    my $mediafile = parseBetween($listing, "<a href=\"/film/fichefilm-".$movieid."/affiches/detail/?cmediafile=", "\" >");

    # get the detail page of that poster and look for the image itself
    my $detailurl = "http://www.allocine.fr/film/fichefilm-".$movieid."/affiches/detail/?cmediafile=".$mediafile;
    my $detail;
    ($rc, $detail) = myth_url_get($detailurl);
    my $uri = parseBetween(parseBetween($detail, "<div class=\"tac\" style=\"\">", "</div>"), "<img src=\"", "\" alt");

    # Same detail page, second attempt: the "Agrandir" (enlarge) link, which
    # is the marker getMovieData relies on for its cover art.
    if ($uri eq "")
    {
         $uri = trim(parseBetween($detail, "<a Target=\"_blank\" Class=\"fs11\" href=\"", "\">Agrandir</a>"));
    }

    # Fall back to the thumbnail on the listing page.  The page downloaded
    # above is reused rather than fetched again, and nothing is printed
    # here, so the caller receives a single URL line.
    if ($uri eq "")
    {
         my $tmp_uri = parseBetween($listing, "<a href=\"/film/fichefilm-".$movieid."/affiches/\">", " alt=");
         $tmp_uri =~ s/\n/ /gm;
         # The leading "h" is part of the search marker, so it is put back.
         $uri = trim(parseBetween($tmp_uri, "<img src='h", "'"));
         if ($uri ne "")
         {
                 $uri = "h$uri";
         }
    }

    # if no picture was found, just download the empty poster
    if ($uri eq "") {
         $uri = "http://images.allocine.fr/r_160_214/commons/emptymedia/AffichetteAllocine.gif";
    }

    print "$uri\n";
 }

 # get Movie List
 #
 # Searches www.allocine.fr for a title and prints one "movieid:title" line
 # per result on STDOUT.
 #
 # Parameters: $filename - raw title or file name; it is passed through
 #                         cleanTitleQuery() before being put in the URL.
 #             $options  - optional; only echoed in debug output.
 # Returns:    nothing meaningful; the result is the printed list.
 sub getMovieList {
         my ($filename, $options) = @_; # grab parameters

         my $query = cleanTitleQuery($filename);
         if (!$options) { $options = ""; }
         if (defined $opt_d) {
                 printf("# query: '%s', options: '%s'\n", $query, $options);
         }

         # get the search results  page
         my $request = "http://www.allocine.fr/recherche/1/?q=$query";
         if (defined $opt_d) { printf("# request: '%s'\n", $request); }
         my ($rc, $response) = myth_url_get($request);

         # Decode to Perl characters rather than squeezing the page into
         # iso-8859-1, so characters outside Latin-1 survive; STDOUT's :utf8
         # layer re-encodes them on output.
         # NOTE(review): assumes myth_url_get returns raw UTF-8 bytes, as the
         # previous from_to() call did -- confirm against MythVideoCommon.
         $response = defined $response ? decode('utf-8', $response) : "";
         $response =~ s/\n//g;

         if ($response eq "") {
                 if (defined $opt_d) { printf("# no results\n"); }
                 return;
         }

         # Each result sits between these two markers.
         my $beg = "<div style=\"margin-top:-5px;\">";
         my $end = "<span class=\"fs11\">";

         my @movies;
         my $data = $response;
         my $start = index($data, $beg);
         while ($start != -1) {
                 $start += length($beg);
                 my $finish = index($data, $end, $start);

                 # An entry without its closing marker cannot be cut out.
                 # Stop here: otherwise $data is never shortened and the
                 # same entry is found again forever.
                 last if $finish == -1;

                 my $sub1 = substr($data, $start, $finish - $start);
                 # drop everything from the last "(" onwards
                 $sub1 =~ s/(.*)\(.*$/$1/;
                 my $moviename = trim(removeTag($sub1));
                 my $movienum = parseBetween($sub1, "<a href='/film/fichefilm_gen_cfilm=", ".html");

                 # Append the year again when the name still holds one in
                 # parentheses.
                 my $title = $moviename;
                 my ($movieyear) = $moviename =~ /\((\d+)\)/;
                 if ($movieyear) {
                         $title = $title." (".$movieyear.")";
                 }

                 # add to array
                 push(@movies, "$movienum:$title");

                 # advance data to next movie
                 $data = substr($data, $finish);
                 $start = index($data, $beg);
         }

         # display array of values
         foreach my $movie (@movies) {
                 print "$movie\n";
         }
 }

 #
 # Main Program
 #

 # parse command line arguments
 #
 # Options are declared by their long names.  The single-letter forms shown
 # in usage() (-v, -D, -M, ...) rely on Getopt::Long accepting unique
 # abbreviations of those names.
 # "help" is registered so that -h / --help sets $opt_h instead of being
 # rejected as an unknown option.

     GetOptions( "utf8" => \$opt_u_dummy,
                 "help" => \$opt_h,
                 "version" => \$opt_v,
                 "info" => \$opt_i,
                 "language" => \$opt_l,
                 "originaltitle" => \$opt_originaltitle,
                 "casting" => \$opt_casting,
                 "Data" => \$opt_D,
                 "Movie" => \$opt_M,
                 "Poster" => \$opt_P
                 );


 # print out info
 if (defined $opt_v) { version(); exit 1; }
 if (defined $opt_i) { info(); exit 1; }
 if (defined $opt_l) {
     # "language" is declared as a plain flag, so its value is still the
     # first remaining argument; take it off @ARGV and ignore it.
     my $lang = shift;
 }

 # print out usage if needed (help() does not return)
 if (defined $opt_h || $#ARGV<0) { help(); }

 if (defined $opt_D) {
    # take movieid from cmdline arg
    my $movieid = shift || die "Usage : $0 -D <movieid>\n";
    getMovieData($movieid);
 }

 elsif (defined $opt_P) {
    # take movieid from cmdline arg
    my $movieid = shift || die "Usage : $0 -P <movieid>\n";
    getMoviePoster($movieid);
 }

 elsif (defined $opt_M) {
    # take query from cmdline arg: every remaining argument, each one
    # followed by a single space (@ARGV is never empty at this point)
    my $query = join(' ', @ARGV) . ' ';
    my $options = '';
    getMovieList($query, $options);
 }
 # vim: set expandtab ts=3 sw=3 :

-- 
Ticket URL: <http://svn.mythtv.org/trac/ticket/9074>
MythTV <http://www.mythtv.org/>
MythTV Media Center


More information about the mythtv-commits mailing list