#!/usr/bin/ruby -w

require 'optparse'
require 'ostruct'
require 'net/http'
require 'uri'
require 'rexml/document'
include REXML

# Reference time for this run and the number of seconds in a day; both are
# used for age calculations on the ratings list and the ratings cache.
#
NOW = Time.now
ONE_DAY = 86400

# Static facts about this grabber: name, version, URLs and working directory.
#
class Grabber
  PROGRAM = $0.sub( /^.*\//, '' )
  VERSION = '0.9.9'
  RELEASE = '$Id: tv_grab_nl_upc,v 1.133 2008/06/17 05:26:30 ianmacd Exp $'
  GUIDE = 'http://epg.upc.nl/Entertainment/TV_gids/'
  DESC = 'UPC Cable TV, The Netherlands (%s)' % [ GUIDE ]
  URL = 'http://www.caliban.org/ruby/xmltv_upc.shtml'
  CAPABILITIES = %w[ baseline manualconfig ]
  WORKING_DIR = File.expand_path( '~/.xmltv' )
end

# A film rating plus the bookkeeping needed to cache it.
#
# The class methods look ratings up (in the cache, in the static IMDB ratings
# list, or by fetching pages from IMDB) and load/dump the YAML cache file.
# An instance is a single cache entry: a rating string such as '7.5/10', or
# false for a cached negative result, together with creation and
# modification times.
#
class Rating
  IMDB_URL = 'http://uk.imdb.com'
  RATINGS_FILE = File.join( Grabber::WORKING_DIR, 'ratings.list' )
  RATINGS_FILE_GZ = File.join( Grabber::WORKING_DIR, 'ratings.list.gz' )
  RATINGS_CACHE_FILE = 'ratings_cache.yaml'

  # Matches a rating such as 7.5/10 on a fetched page; group 1 is the rating.
  #
  # NOTE(review): the pattern requires a closing </a> after the vote count
  # but no opening tag; it looks as if markup has been lost from it. It is
  # kept exactly as found; verify it against a real page.
  #
  RATING_REGEX = /([\d\.]+\/\d+)<\/b>(?:.{0,32}\([\d,]+ votes<\/a>\))/m

  @@rated = 0            # ratings found during this run
  @@not_rated = 0        # lookups that failed during this run
  @@cache = {}           # title => Rating
  @@cache_positives = 0  # cache hits that yielded a rating
  @@cache_negatives = 0  # cache hits that yielded a negative result
  @@ratings = {}         # lower-case title => rating, from the static list

  # @timestamp is the legacy name of @mtime; see Rating.load_cache.
  #
  attr_reader :timestamp
  attr_accessor :rating, :ctime, :mtime

  # Download and decompress the IMDB ratings list if it doesn't exist or is
  # older than 7 days.
  # See http://www.imdb.com/interfaces#plain for more information.
  #
  # Exits the program if no gunzip command can be found.
  #
  def Rating.get_ratings_list( options )
    if File.exist?( RATINGS_FILE ) &&
       ( NOW - File.mtime( RATINGS_FILE ) ) / ONE_DAY <= 7
      return
    end

    if options.verbose
      $stderr.puts 'Attempting to download IMDB ratings list via FTP...'
    end

    require 'net/ftp'

    Net::FTP.open( 'ftp.funet.fi' ) do |ftp|
      ftp.debug_mode = true if options.debug
      ftp.login
      ftp.chdir( '/pub/mirrors/ftp.imdb.com/pub' )
      ftp.passive = true
      ftp.getbinaryfile( 'ratings.list.gz', RATINGS_FILE_GZ, 1024 )
    end

    require 'shell'

    begin
      gunzip = Shell::CommandProcessor.new( Shell.new ).
                 find_system_command( 'gunzip' )

    # If we don't have gunzip in our PATH, an exception will be raised.
    #
    rescue Shell::Error::CommandNotFound
      $stderr.puts 'Please gunzip %s and then rerun.' % [ RATINGS_FILE_GZ ]
      exit
    end

    # Decompress ratings file.
    #
    $stderr.puts 'Decompressing ratings file...' if options.verbose
    system( gunzip, '-f', RATINGS_FILE_GZ )
  end

  # Record the outcome of a lookup in the cache under +title+.
  #
  # Returns the rating (e.g. '7/10') when one was found. Otherwise a negative
  # result is cached and nil is returned.
  #
  def Rating.cache_rating( title, options, rating )
    if rating
      $stderr.puts ' Succeeded: %s.' % [ rating ] if options.verbose
      @@cache[title] = Rating.new( rating )
      @@rated += 1
      @@cache[title].rating
    else
      $stderr.puts ' Failed.' if options.verbose
      @@cache[title] = Rating.new( false )
      @@not_rated += 1
      nil
    end
  end

  # Fill @@ratings from the static IMDB ratings file, keyed by lower-case
  # title with the year and anything after it removed.
  #
  def Rating.read_static_ratings( options )
    $stderr.puts ' Reading ratings file...' if options.verbose

    File.open( RATINGS_FILE ) do |f|
      body = false

      f.each_line do |l|
        # Get to the right point in the file.
        #
        body = true if l =~ /^MOVIE RATINGS REPORT/
        next unless body

        if m = l.match( /^\s+\S{10}\s+\d+\s+(\d+\.\d+)\s+(.+)$/ )
          # Remove year and any other trailing stuff from film title and
          # convert it to lower case.
          #
          t = m[2].sub( /\s*\(\d+\).*$/, '' ).downcase
          @@ratings[t] = m[1]
        end
      end

      if options.verbose
        $stderr.puts ' Read %d ratings.' % [ @@ratings.size ]
      end
    end
  end

  # Return a rating such as '7.5/10' for +title+, or nil if none is found.
  #
  # The cache is consulted first. With options.static_ratings the static
  # ratings file is used; otherwise pages are fetched from IMDB with
  # get_page. Results, positive or negative, are cached. A failed page fetch
  # returns nil without caching anything, so a network problem is not
  # remembered as a negative result. The caller's string is not modified.
  #
  def Rating.imdb_rating( title, options )
    # Titles come from UPC as UTF-8, so we check for any accented alpha
    # characters in the title and, if we find them, convert the title to
    # Latin-1 (ISO-8859-1).
    #
    # The byte-range regexes in this method carry the 'n' flag, so that they
    # are always compiled as byte patterns; without it, interpreters that
    # assume a multibyte source encoding refuse to parse them.
    #
    if title =~ /\xC3[\x80-\xBF]/n
      title = title.unpack( 'U*' ).pack( 'c*' )
    end

    if options.verbose
      $stderr.puts ' Attempting to rate "%s"...' % [ title ]
    end

    if @@cache.key?( title )
      entry = @@cache[title]

      # Update timestamp on cache entry.
      #
      entry.mtime = Time.now

      if entry.rating
        if options.verbose
          $stderr.puts ' Found in cache: %s.' % [ entry.rating ]
        end
        @@cache_positives += 1
        return entry.rating
      elsif entry.rating == false
        $stderr.puts ' Negative result found in cache.' if options.verbose
        @@cache_negatives += 1
        return nil
      end
    end

    if options.static_ratings
      # If static ratings are desired, create the hash of titles and ratings
      # from the static IMDB ratings file the first time we need it.
      #
      Rating.read_static_ratings( options ) if @@ratings.empty?

      l_title = title.downcase
      rating = @@ratings.key?( l_title ) ? @@ratings[l_title] + '/10' : nil
      return Rating.cache_rating( title, options, rating )
    end

    # Otherwise, grab the rating dynamically from IMDB.
    #
    # The search title may be rewritten below (', The' moved to the front),
    # so we work on a copy; the result is cached under the guide's title.
    #
    search_title = title.dup
    res = nil

    loop do
      # When building the URL, we have to remove any ampersand entities in
      # the title.
      #
      plain_title = search_title.gsub( /&amp;/, '&' )
      url = URI.parse( IMDB_URL +
                       '/find?q=%s;s=tt' % [ url_encode( plain_title ) ] )
      req = Net::HTTP::Get.new( url.path + '?' + url.query )
      res = get_page( url, req, options )
      break if res.nil?

      # This version of the title will be used for matching. It has all
      # non-alphanumeric characters made optional and all accented alpha
      # characters (and ampersands) replaced by a regex that will match any
      # HTML entity at that position in the string.
      #
      # Note that the # character must be escaped, as we will be
      # interpolating it into the /x style regex that follows.
      #
      mod_title = plain_title.
        gsub( /[^&[:alnum:]\xA0-\xFF]/n, '.?' ).
        gsub( /[&\xA0-\xFF]/n, '&(?:\#\d+|[[:alpha:]]+);' )

      # Most film titles return 'Popular Titles', but occasionally one will
      # return just 'Exact Matches' (or even 'Approx Matches', which we
      # ignore).
      #
      # After the result count, we grab the URL path of the link whose text
      # is our title. To make sure we really matched the same title and not
      # a substring of some other title, the link text is matched
      # case-insensitively, using an embedded modifier, against mod_title.
      #
      # NOTE(review): the pattern as found had no capture group, although
      # m[1] is used as the URL path below. The href capture is a
      # reconstruction; verify it against the real page markup.
      #
      if m = res.match( /
                 (?:Popular\sTitles | Titles\s\(Exact\sMatches\) )<\/b>\s+
                 \(Displaying\s\d+\sResults?\).+?
                 <a\s[^>]*?href="([^"]+)"[^>]*>
                 (?i:#{mod_title})<\/a>
               /x )

        # Now we follow the URL path to get the ratings page.
        #
        url = URI.parse( IMDB_URL + m[1] )
        req = Net::HTTP::Get.new( url.path )
        res = get_page( url, req, options )
        break

      elsif res !~ RATING_REGEX && search_title.sub!( /, The$/, '' )
        # If we haven't just fetched the film's ratings page and its title
        # ends with ', The', we can try again with 'The' at the beginning of
        # the title instead.
        #
        search_title = 'The ' + search_title
        if options.debug
          $stderr.puts ' Reattempting as "%s"...' % [ search_title ]
        end
        next

      else
        # We must be on the ratings page.
        #
        break
      end
    end

    if res.nil?
      $stderr.puts ' Failed to fetch page.' if options.verbose
      return nil
    end

    # At this point, we should have a page that contains a rating for this
    # film.
    #
    m = res.match( RATING_REGEX )
    Rating.cache_rating( title, options, m ? m[1] : nil )
  end

  # Load the YAML ratings cache named by options.ratings, upgrade legacy
  # entries and expire entries that were last touched more than a week ago.
  #
  # NOTE(review): YAML.load instantiates the objects described in the file.
  # The cache is written by Rating.dump_cache, but the file should not be
  # replaced with data from an untrusted source.
  #
  def Rating.load_cache( options )
    require 'yaml'

    unless File.exist?( options.ratings )
      $stderr.puts 'No ratings cache file found.'
      return nil
    end

    $stderr.puts 'Loading ratings cache file...' if options.verbose
    @@cache = File.open( options.ratings ) { |f| YAML.load( f ) } || {}

    # Expire any cache entries older than a week.
    #
    orig_size = @@cache.size
    positive = 0
    negative = 0

    $stderr.puts 'Expiring ratings cache entries...' if options.debug

    @@cache.delete_if do |title, entry|
      # Legacy: @mtime used to be @timestamp.
      #
      if entry.timestamp
        entry.mtime ||= entry.timestamp
        entry.ctime ||= entry.mtime

        # Remove the old attribute. instance_eval is used, so that no
        # singleton method is left behind on the entry.
        #
        entry.instance_eval { remove_instance_variable( '@timestamp' ) }
      end

      expired = ( NOW - entry.mtime ) / ONE_DAY > 7

      # Count the positive/negative cache entries that we keep.
      #
      unless expired
        if entry.rating
          positive += 1
        else
          negative += 1
        end
      end

      expired
    end

    if options.verbose
      $stderr.puts '%d rating(s) expired. %d remaining, of which %d positive and %d negative.' %
        [ orig_size - @@cache.size, @@cache.size, positive, negative ]
    end

    nil
  end

  # Write the ratings cache to the YAML file named by options.ratings.
  #
  def Rating.dump_cache( options )
    # Don't rely on Rating.load_cache having been called first.
    #
    require 'yaml'

    $stderr.puts 'Dumping ratings cache file...' if options.verbose
    File.open( options.ratings, 'w' ) { |f| YAML.dump( @@cache, f ) }

    if options.verbose
      $stderr.puts '%d ratings cache entries written.' % [ @@cache.size ]
    end
  end

  # Number of ratings found during this run.
  #
  def Rating.rated
    @@rated
  end

  # Number of lookups that failed during this run.
  #
  def Rating.not_rated
    @@not_rated
  end

  # Number of cache hits that yielded a rating.
  #
  def Rating.cache_positives
    @@cache_positives
  end

  # Number of cache hits that yielded a negative result.
  #
  def Rating.cache_negatives
    @@cache_negatives
  end

  # +rating+ is a string such as '7.5/10', or false for a negative result.
  #
  def initialize( rating )
    @rating = rating
    @ctime = @mtime = Time.now
  end
end

# Command line handling.
#
class ProgramOptions

  # Parse +args+ (destructively) and return an OpenStruct holding one
  # attribute per option, preset to its default.
  #
  # The --capabilities, --description, --help and --version options print
  # their output and exit.
  #
  def self.parse( args )
    o = OpenStruct.new

    op = OptionParser.new do |opt|
      opt.banner = "Usage: #{Grabber::PROGRAM} [options]"

      # This must be the grabber's own version. A bare VERSION is not defined
      # in this class and resolves to some unrelated top-level constant.
      #
      opt.version = Grabber::VERSION

      opt.define( "\n All options may be specified using the shortest " +
                  'unique string.' )
      opt.define( ' For example, --output-file may be abbreviated to ' +
                  "--output or even --out.\n")
      opt.define( " See #{Grabber::URL} for further details.\n" )

      # Capabilities are defined here:
      #
      # http://www.xmltv.org/wiki/xmltvcapabilities.html
      #
      opt.define( '--capabilities',
                  "List this grabber's capabilities and exit." ) do
        puts Grabber::CAPABILITIES
        exit
      end

      o.cattrans = true
      opt.define( '--[no-]cattrans', 'Perform category translation.',
                  "(default: #{o.cattrans})" ) do |trans|
        o.cattrans = trans
      end

      o.config = File.join( Grabber::WORKING_DIR, 'tv_grab_nl_upc.conf' )
      opt.define( '--config-file FILE',
                  'File containing channel numbers and names.',
                  '(default:', o.config + ')' ) do |file|
        o.config = File.expand_path( file )
      end

      o.configure = false
      opt.define( '--configure', 'Create a configuration file',
                  '(will overwrite existing file).' ) do |config|
        o.configure = config
      end

      o.days = 7
      opt.define( '--days NUMBER',
                  'Fetch data for NUMBER days (1 = today only)',
                  "(default: #{o.days})", Integer ) do |days|
        o.days = days
      end

      o.debug = false
      opt.define( '--debug', 'Print debugging messages. Debugging can',
                  'be dynamically toggled with SIGUSR1.' ) do |debug|
        o.debug = debug
      end

      opt.define( '--description', 'Describe this grabber and exit.' ) do
        puts Grabber::DESC
        exit
      end

      opt.define( '--help', 'Display this usage message and exit.' ) do
        puts opt
        exit
      end

      o.icons = false
      opt.define( '--icons',
                  'Include icon links for the channels.' ) do |icons|
        o.icons = icons
      end

      opt.define( '--logos', 'Alias for --icons.' ) { |icons| o.icons = icons }

      o.offset = 0
      opt.define( '--offset DAYS', 'Start day from which to start fetching.',
                  "(default: #{o.offset})", Integer ) do |days|
        o.offset = days
      end

      o.output = nil
      opt.define( '--output-file FILE',
                  '(N.B. Filled in by mythfilldatabase.)' ) do |file|
        o.output = file
      end

      o.quiet = false
      opt.define( '--quiet', 'Suppress warnings and errors.' ) do |quiet|
        o.quiet = quiet
      end

      # o.ratings is either false or the path of the ratings cache file.
      # With --no-ratings the block receives false and the default stands.
      #
      o.ratings = false
      opt.define( '--[no-]ratings [DIR]',
                  'Attempt to dynamically rate films via IMDB',
                  'and cache results (in DIR, if given).',
                  "(default: #{o.ratings})" ) do |ratings|
        if ratings.nil?
          o.ratings = File.join( Grabber::WORKING_DIR,
                                 Rating::RATINGS_CACHE_FILE )
        elsif ratings
          o.ratings = File.join( File.expand_path( ratings ),
                                 Rating::RATINGS_CACHE_FILE )
        end
      end

      o.sanity = false
      opt.define( '--sanity-check',
                  'Perform sanity checks before pulling data.' ) do |sanity|
        o.sanity = sanity
      end

      # Still accepted, so that old command lines keep working.
      #
      opt.define( '--schema SCHEMA', [ '1', '2' ],
                  'Choose which programme schema to use.',
                  '(This option no longer has any effect.)',
                  Integer ) do |schema|
      end

      o.sleep = 1.0
      opt.define( '--sleep SECONDS',
                  'Sleep this many seconds between each page',
                  "fetch. (default: #{o.sleep})", Float ) do |secs|
        o.sleep = secs
      end

      o.static_ratings = false
      opt.define( '--static-ratings',
                  'Attempt to statically rate films via IMDB.',
                  "(default: #{o.static_ratings})" ) do |ratings|
        o.static_ratings = ratings
      end

      o.threads = false
      opt.define( '--threads', 'Use threads to fetch channel data in',
                  'parallel. (experimental) ' +
                  "(default: #{o.threads})" ) do |threads|
        o.threads = threads
      end

      o.tries = 3
      opt.define( '--tries TRIES', 'Number of times to try each page fetch.',
                  "(default: #{o.tries})", Integer ) do |tries|
        o.tries = tries
      end

      o.verbose = false
      opt.define( '--verbose', 'Print informational messages. Verbosity',
                  'can be dynamically toggled with SIGUSR2.' ) do |verbose|
        o.verbose = verbose
      end

      opt.define( '--version', 'Display program version and exit.' ) do
        puts '%s v%s (C) 2006-2007 Ian Macdonald' %
          [ Grabber::PROGRAM, Grabber::VERSION ]
        exit
      end

      o.xmltvid_suffix = '.chello.nl'
      opt.define( '--xmltvid-suffix STRING',
                  'Suffix to add to channel number to form',
                  'XMLTV ID. (default: .chello.nl)' ) do |suf|
        o.xmltvid_suffix = suf
      end
    end

    op.parse!( args )
    o
  end
end

BASE_URL = 'http://epg.upc.nl'

# Category translations.
# CATEGORIES = { 'Actie' => 'Action', 'Algemeen' => 'Misc', 'Atletiek' => 'Sports', 'Avontuur' => 'Action', 'Beeldende kunst' => 'Arts/Culture', 'Detective' => 'Crime/Mystery', 'Documentaire' => 'Documentary', 'Drama' => 'Drama', 'Educatie' => 'Educational', 'Erotiek' => 'Adult', 'Extreme' => 'Sports', 'Gevechtssport' => 'Sports', 'Gezondheid' => 'Health/Medical', 'Historisch' => 'History', 'Kids / jeugd' => 'Children', 'Kids/jeugd' => 'Children', 'Klussen' => 'HowTo', 'Koken' => 'Food', 'Komedie' => 'Comedy', 'Kunst / cultuur' => 'Arts/Culture', 'Kunst/cultuur' => 'Arts/Culture', 'Lifestyle' => 'Educational', 'Melodrama' => 'Soaps', 'Militair' => 'War', 'Mode' => 'Educational', 'Motorsport' => 'Sports', 'Musical' => 'Movies', 'Muziek' => 'Art/Music', 'Nieuws' => 'News', 'Paardensport' => 'Sports', 'Religie' => 'Spiritual', 'Romantiek' => 'Romance', 'Sci-fi' => 'SciFi/Fantasy', 'Show' => 'Game', 'Show/spelshow' => 'Game', 'Speelfilm' => 'Movies', 'Sport' => 'Sports', 'Sportmagazine' => 'Sports', 'Talkshow' => 'Talk', 'Teamsporten' => 'Sports', 'Technologie' => 'Science/Nature', 'Tekenfilms' => 'Children', 'Tennis / squash' => 'Sports', 'Theater / dans' => 'Arts/Culture', 'Thriller' => 'Crime/Mystery', 'Tuinieren' => 'HowTo', 'Voetbal' => 'Sports', 'Watersport' => 'Sports', 'Wintersport' => 'Sports', 'Vrije tijd' => 'Educational' } def pre_checks( options ) # Check that we're not running as root. # if Process.euid == 0 $stderr.puts 'This program should not be run as root. Aborting...' exit 6 end # Check for correct locale. Character classes in regular expressions behave # differently, depending on the locale. # if ( lang = ENV['LANG'] ) != 'nl_NL' lang = 'unset' if lang == '' || lang.nil? unless options.quiet $stderr.puts \ 'Warning: Your locale ($LANG) is %s, but nl_NL is recommended.' 
% [ lang ] end end end def get_page(url, req, options) excepts = 0 res = nil loop do begin res = Net::HTTP.start( url.host, url.port ) do |http| http.request( req ) end break unless res.key?( 'location' ) # If we get an HTTP 3xx redirect, we need to follow it. # if options.debug $stderr.puts ' Following HTTP %s redirection...' % [ res.code ] end url = URI.parse( res['location'] ) req = Net::HTTP::Get.new( url.path ) redo rescue EOFError, Errno::ECONNREFUSED, Errno::ECONNRESET, Timeout::Error => e if ( excepts += 1 ) < options.tries unless options.quiet $stderr.puts " '%s' whilst fetching page. Retry #%d..." % [ e, excepts ] end retry end return nil end end res.body end def get_available_channels(options) url = URI.parse( BASE_URL + '/Entertainment/TV_gids/Vergelijk_zenders/Favorieten/' ) req = Net::HTTP::Get.new( url.path ) page = get_page( url, req, options ) if page.nil? unless options.quiet $stderr.puts 'Failed to fetch channel list from UPC. Aborting...' end exit 5 end # Create an array of channel names. # # Channels are recognisable on the page, because the literal name appears # twice, according to the following example: # # # # Unfortunately, the exception to this is: # # # # so we have to have a fuzzy match for those, too. # upc_channels = page.scan( /