#!/usr/bin/perl -w
#
# Build the database of info (in plain text format so it can be easily
# edited by hand) about all the doctor who episodes.
#
# Use -f option to force updating time and size of all video files when
# doing the update.
#
use strict;
use warnings;
use File::Basename qw(dirname);

# Filesystem layout: everything lives under the DoctorWho tree, with the
# hand-editable database file kept in a hidden .data subdirectory.
my $topdir = "/huge/vids";
my $whodir = "$topdir/DoctorWho";
my $dbdir  = "$whodir/.data";
my $dbfile = "$dbdir/allinfo.txt";

# %db holds the final per-episode records, keyed by basename.
# %dirdb holds per-directory candidate records (also keyed by basename)
# so duplicate recordings can be scored against each other.
my %db;
my %dirdb;

# Get PATH set to include this script's directory and other useful bits.
# Use core File::Basename instead of shelling out to dirname(1); the
# echo-path helper script beside this one prints the full PATH to use.
my $newpath = dirname($0);
$newpath = `$newpath/echo-path`;
chomp($newpath);
$ENV{'PATH'} = $newpath;

# -f forces re-reading the length and size of every video file.
my $force = (scalar(@ARGV) > 0) && ($ARGV[0] eq "-f");
# Utility routine to check whether file1 and file2 are the same file
# (i.e. same device and inode). Returns false if either does not exist.
#
# True when both paths resolve to the same underlying file: both must
# exist and share the same device and inode numbers.
sub samefile {
    my ($path_a, $path_b) = @_;
    my @st_a = stat($path_a);
    my @st_b = stat($path_b);
    # stat() returns an empty list on failure; require both stats to
    # succeed, then compare inode ([1]) and device ([0]) fields.
    return ((scalar(@st_a) > 1) && (scalar(@st_b) > 1) &&
            ($st_a[1] == $st_b[1]) && ($st_a[0] == $st_b[0]));
}
# Utility routine to check file1 to see if it is newer than file2
# (by modification time); always returns 1 if either file does not exist.
#
# True when file1's mtime is strictly greater than file2's. If either
# stat fails (fewer than 10 fields back), treat file1 as newer.
sub newerthan {
    my ($path_a, $path_b) = @_;
    my @st_a = stat($path_a);
    my @st_b = stat($path_b);
    return 1 if scalar(@st_a) < 10;
    return 1 if scalar(@st_b) < 10;
    # Field [9] is the modification time.
    return ($st_a[9] > $st_b[9]);
}
# Utility routine to run midentify on a video file and return its length
# in seconds, or undef if midentify produced no ID_LENGTH line (or could
# not be started).
#
sub getvidseconds {
    my $vidfile = shift;
    my $seconds;
    my $pipe;
    # List-form pipe open avoids the shell; midentify emits KEY=VALUE
    # lines and we keep the last ID_LENGTH value seen.
    if (open($pipe, '-|', 'midentify', "$vidfile")) {
        while (my $line = <$pipe>) {
            $seconds = $1 if $line =~ /^ID_LENGTH=(.+)$/;
        }
        close($pipe);
    }
    return $seconds;
}
# Read the info.txt file with the html table data copied from the
# original TiVo directory web page. Return the episode description
# with various formatting cleaned up, or undef if no "Doctor Who:"
# line was found (or the file could not be opened).
#
# Argument: directory name relative to $whodir.
#
sub getdescription {
    my $dirname = shift;
    my $inf;
    my $description;
    if (open($inf, '<', "$whodir/$dirname/info.txt")) {
        while (<$inf>) {
            # The description is the table cell naming the series.
            if (/Doctor Who:/) {
                chomp;
                # Remove boring and repetitive copyright notice at end.
                s/Copyright.*\<\/td\>/\<\/td\>/;
                # Remove leading and trailing table data html tags.
                # (Fixed: this substitution was garbled across two lines;
                # it strips a leading "<td ...>" opening tag.)
                s/^\s*\<td[^\>]*\>//;
                s/\<\/td\>//;
                # And leading and trailing spaces.
                s/^\s+//;
                s/\s+$//;
                $description = $_;
                last;
            }
        }
        close($inf);
    }
    return $description;
}
# Check for the .edl file generated by running comskip. If it exists and the
# first commercial skip starts within 2 seconds of the beginning of the mpg
# file, then it is "good", otherwise it is "bad".
#
# I now convert the .edl into a .keep, and if I've created a .keep with
# no guess- prefix, then there are no edl errors (because I manually fixed
# them in the .keep file).
#
# Arguments: directory name relative to $whodir, and the episode basename.
# Returns the string 'good' or 'bad'.
#
sub edlstatus {
    my $dirname = shift;
    my $basefile = shift;
    my $inf;
    my $edl = 'bad';
    if (-f "$whodir/$dirname/$basefile.keep") {
        # A hand-verified .keep file means the edit list is trusted.
        $edl = 'good';
    } else {
        if (open($inf, '<', "$whodir/$dirname/$basefile.edl")) {
            # Only the first line matters: start, end, action.
            my $line = <$inf>;
            if (defined($line)) {   # guard: an empty .edl yields undef here
                chomp($line);
                my @ed = split(' ', $line);
                if (scalar(@ed) == 3) {
                    if ($ed[0] <= 2.0) {
                        $edl = 'good';
                    }
                }
            }
            close($inf);
        }
    }
    return $edl;
}
# What with marking some directories bad and asking to re-record
# episodes (or the TiVo just re-recording them by itself), I might
# have multiple directories with copies of the full .mpg file. This
# routine records the directory info for each directory under a
# top level hash indexed by the basename. I can sort through any
# duplicates and decide which directory has the best copy to include
# in the final database file.
#
# Arguments: episode basename, recording date (from the directory name),
# directory name relative to $whodir, and a flag that is false when the
# directory carries the "bad-" prefix.
#
# Side effects: stores a scored record in $dirdb{basename}{dirname}, and
# seeds $db{basename} with that same record when no main entry exists.
#
sub accumdirinfo {
    my $basefile = shift;
    my $recdate = shift;
    my $dirname = shift;
    my $goodir = shift;
    # Look up (or create) the per-basename hash of candidate directories.
    my $topr = $dirdb{$basefile};
    if (! defined($topr)) {
        $topr = {};
        $dirdb{$basefile} = $topr;
    }
    # Fresh record for this particular directory.
    my $r = {};
    $topr->{$dirname} = $r;
    # If this basename has never been seen, the record also becomes the
    # main database entry.
    my $mainr = $db{$basefile};
    if (! defined($mainr)) {
        $db{$basefile} = $r;
    }
    $r->{'score'} = 0;
    # If the existing db entry already points at this directory and the
    # .mpg has not changed since the db file was written, carry the
    # cached values forward (avoids re-running the slow midentify scan)
    # and award a large bonus for being the previously chosen copy.
    if (defined($mainr) &&
        exists($mainr->{'mpgdirname'}) &&
        ($mainr->{'mpgdirname'} eq $dirname) &&
        (! newerthan("$whodir/$dirname/$basefile.mpg", $dbfile))) {
        if (exists($mainr->{'edl'})) {
            $r->{'edl'} = $mainr->{'edl'};
        }
        if (exists($mainr->{'mpgseconds'})) {
            $r->{'mpgseconds'} = $mainr->{'mpgseconds'};
        }
        if (exists($mainr->{'description'})) {
            $r->{'description'} = $mainr->{'description'};
        }
        if (exists($mainr->{'mpgsize'})) {
            $r->{'mpgsize'} = $mainr->{'mpgsize'};
        }
        if (exists($mainr->{'mpg'})) {
            $r->{'mpg'} = $mainr->{'mpg'};
        }
        $r->{'score'} += 2000;
    }
    $r->{'mpgdirname'} = $dirname;
    $r->{'recdate'} = $recdate;
    # Directories explicitly marked bad are heavily penalized.
    if (! $goodir) {
        $r->{'score'} -= 1000;
    }
    # Commercial-skip status: a bad edit list costs points.
    if (! exists($r->{'edl'})) {
        $r->{'edl'} = edlstatus($dirname, $basefile);
        if ($r->{'edl'} eq 'bad') {
            $r->{'score'} -= 100;
        }
    }
    # Video length: reward full-length recordings (>= ~62 min), penalize
    # short ones (< ~60 min), and penalize hard when midentify fails.
    if ($force || (! exists($r->{'mpgseconds'}))) {
        my $seconds = getvidseconds("$whodir/$dirname/$basefile.mpg");
        if (defined($seconds)) {
            $r->{'mpgseconds'} = $seconds;
            if ($seconds >= (61.9*60)) {
                $r->{'score'} += 200;
            } elsif ($seconds < (60.1*60)) {
                $r->{'score'} -= 200;
            }
        } else {
            $r->{'score'} -= 500;
        }
    }
    # Pull the episode description from info.txt if not already cached.
    if (! exists($r->{'description'})) {
        my $description = getdescription($dirname);
        if (defined($description)) {
            $r->{'description'} = $description;
        }
    }
    # Record the .mpg size (field [7] of stat) and existence flag.
    if ($force || (! exists($r->{'mpgsize'}))) {
        my @mpstat = stat("$whodir/$dirname/$basefile.mpg");
        if (scalar(@mpstat) > 7) {
            $r->{'mpgsize'} = $mpstat[7];
            $r->{'mpg'} = 1;
        }
    }
    if (! exists($r->{'mpg'})) {
        if (-f "$whodir/$dirname/$basefile.mpg") {
            $r->{'mpg'} = 1;
        }
    }
    # If this directory's .avi is hard-linked to the top-level dated .avi,
    # it is the copy actually in use: give it an overwhelming bonus.
    if (defined($mainr)) {
        my $airdate = $mainr->{'airdate'};
        if (defined($airdate) && samefile("$whodir/$airdate-$basefile.avi",
            "$whodir/$dirname/$basefile.avi")) {
            $r->{'score'} += 1000000;
        }
    }
}
# Read in the existing database (if any) to start with known data
# (which may no longer exist anywhere else). Do not record the 'avi'
# and 'mpg' flags - need to verify the existence of such files
# again during the rebuild of the database.
#
# ($r is intentionally file-scoped: later loops below reuse it.)
my $fh;
my $r;
if (open($fh, '<', $dbfile)) {
    while (my $line = <$fh>) {
        chomp $line;
        if ($line =~ /^\[(.+)\]$/) {
            # Section header: start a fresh record for this basename.
            $r = {};
            $db{$1} = $r;
        } elsif ($line =~ /^([A-Za-z0-9_]+)=(.+)$/) {
            # key=value line; ignore anything before the first section.
            if (defined($r)) {
                my ($key, $val) = ($1, $2);
                unless (($key eq 'avi') || ($key eq 'mpg')) {
                    $r->{$key} = $val;
                }
            }
        }
    }
    close($fh);
    undef($fh);
    undef($r);
}
# Read the list of .avi files in $whodir to find any basenames and
# airdates that might not already be in the database. If the .avi file
# is newer than the database file, also update the length info in
# the database.
my $dh;
my @whodirnames;
if (opendir($dh, $whodir)) {
    @whodirnames = readdir($dh);
    closedir($dh);
    undef $dh;
}
# ($n is file-scoped: the directory-scanning loop below reuses it.)
my $n;
foreach $n (@whodirnames) {
    # Top-level avi names look like YYYY-MM-DD-BaseName.avi
    next unless $n =~ /^(\d+-\d+-\d+)-([A-Za-z0-9_]+)\.avi$/;
    my ($airdate, $basename) = ($1, $2);
    $r = $db{$basename};
    unless (defined($r)) {
        $r = {};
        $db{$basename} = $r;
    }
    # Keep any airdate already recorded in the database.
    unless (exists($r->{'airdate'})) {
        $r->{'airdate'} = $airdate;
    }
    my $avipath = "$whodir/$n";
    my $newavi = newerthan($avipath, $dbfile);
    # Refresh length and size when missing, stale, or forced by -f.
    if ($force || $newavi || (! exists($r->{'aviseconds'}))) {
        my $len = getvidseconds($avipath);
        $r->{'aviseconds'} = $len if defined($len);
    }
    if ($force || $newavi || (! exists($r->{'avisize'}))) {
        $r->{'avisize'} = (stat($avipath))[7];
    }
    $r->{'avi'} = 1;
}
# Now read the directories to find any original .mpg files and other
# info stashed in the download directories. Normal directories look like
# YYYY-MM-DD-NNN-BaseName; ones I marked bad carry a "bad-" prefix and
# get a false "good directory" flag.
foreach $n (@whodirnames) {
    if ($n =~ /^(\d+-\d+-\d+)-\d+-([A-Za-z0-9_]+)$/) {
        accumdirinfo($2, $1, $n, 1);
    }
    elsif ($n =~ /^bad-(\d+-\d+-\d+)-\d+-([A-Za-z0-9_]+)$/) {
        accumdirinfo($2, $1, $n, 0);
    }
}
# Go through all the info gathered from directories and pick the
# highest score directory to copy info into main database.
# ($bn is file-scoped: the save loop below reuses it.)
my $bn;
foreach $bn (keys(%dirdb)) {
    my $candidates = $dirdb{$bn};
    my $best;
    my $bestscore;
    # Scan every candidate directory record, keeping the top scorer.
    foreach my $d (keys(%{$candidates})) {
        my $cand = $candidates->{$d};
        if ((! defined($bestscore)) || ($cand->{'score'} > $bestscore)) {
            $bestscore = $cand->{'score'};
            $best = $cand;
        }
    }
    # Merge the winner's fields (except the transient score) into the
    # main database record.
    my $mainr = $db{$bn};
    if (defined($best) && defined($mainr)) {
        foreach my $key (keys(%{$best})) {
            $mainr->{$key} = $best->{$key} unless $key eq 'score';
        }
    }
}
# Save new db (keeping backup)
#
# sort() comparator: order basenames by airdate (missing airdates sort
# first as empty strings), breaking ties by basename.
sub compare_airdate {
    my $date_a = $db{$a}->{'airdate'};
    my $date_b = $db{$b}->{'airdate'};
    $date_a = '' unless defined($date_a);
    $date_b = '' unless defined($date_b);
    # cmp yields -1/0/1; fall through to the name comparison on ties.
    return ($date_a cmp $date_b) || ($a cmp $b);
}
# Write the new database to a temp file, then rotate it into place,
# keeping the previous version as a .bak hard link.
my $dbtemp = "$dbfile.$$";
my $dbh;
my $missing_airdate = 0;
if (open($dbh, '>', $dbtemp)) {
    foreach $bn (sort compare_airdate keys %db) {
        $r = $db{$bn};
        # Remember whether any episode still lacks an airdate.
        $missing_airdate = 1 unless exists($r->{'airdate'});
        print $dbh "\n[$bn]\n";
        foreach my $key (sort(keys(%{$r}))) {
            my $val = $r->{$key};
            print $dbh "$key=$val\n";
        }
    }
    close($dbh);
    # Rotate: current db becomes .bak, temp file becomes current.
    unlink("$dbfile.bak");
    link($dbfile, "$dbfile.bak");
    unlink($dbfile);
    link($dbtemp, $dbfile);
    unlink($dbtemp);
}
if ($missing_airdate) {
    # I'm missing airdates for some episodes, update the database
    # from the Doctor Who wiki page to fill in any new airdates
    # discovered since the last time I did this.
    system("update-airdates");
}