#!/usr/bin/ruby -w

require 'optparse'
require 'ostruct'
require 'net/http'
require 'uri'
require 'rexml/document'
include REXML

NOW = Time.now
ONE_DAY = 86400

# Program-wide constants together with the helpers that are not tied to a
# particular channel or programme: environment sanity checks, interactive
# creation of the channel configuration file, parsing of that file, fetching
# pages over HTTP and URL-encoding.
#
class Grabber

  PROGRAM = $0.sub( /^.*\//, '' )
  VERSION = '1.8.3'
  RELEASE = '$Id: tv_grab_nl_upc,v 1.212 2010/03/26 09:34:05 ianmacd Exp $'
  GUIDE = 'http://tvgids.upc.nl/TV/'
  DESC = 'UPC Cable TV, The Netherlands (%s)' % [ GUIDE ]
  URL = 'http://www.caliban.org/ruby/xmltv_upc.shtml'
  CAPABILITIES = %w[ baseline manualconfig ]
  WORKING_DIR = File.expand_path( '~/.xmltv' )
  BASE_URL = 'http://tvgids.upc.nl'

  # Number of seconds added to options.sleep each time the server tells us to
  # slow down.
  #
  BACKOFF_INCREMENT = 0.20

  # Upper bound on the number of HTTP redirections followed for one page, so
  # that a redirect loop cannot hang the program.
  #
  MAX_REDIRECTS = 10

  # Checks to ensure we're in a sane environment before we do anything else.
  #
  # options must respond to #quiet. Exits with status 6 when run as root;
  # otherwise only prints warnings (suppressed when options.quiet is set).
  #
  def Grabber.pre_checks(options)

    # Check that we're not running as root.
    #
    if Process.euid == 0
      $stderr.puts 'This program should not be run as root. Aborting...'
      exit 6
    end

    # Check for correct locale. Character classes in regular expressions behave
    # differently, depending on the locale.
    #
    lang = ENV['LANG']
    if lang != 'nl_NL' && ! options.quiet

      # nil must be tested first: ENV returns nil for an unset variable and
      # nil does not respond to empty?.
      #
      lang = 'unset' if lang.nil? || lang.empty?
      $stderr.puts \
        'Warning: Your locale ($LANG) is %s, but nl_NL is recommended.' %
        [ lang ]
    end

    # Check for a local time zone that is different from the Netherlands,
    # because programme start and finish times will be wrong.
    #
    if NOW.zone !~ /^CES?T$/ && ! options.quiet
      message = 'Warning: Your time zone is %s, not CET/CEST, so programme ' +
                "start and end times\n will probably be wrong."
      $stderr.puts message % [ NOW.zone ]
    end
  end

  # Create a directory, aborting the programme (exit status 7) if the given
  # path already exists as some other file type. Does nothing when the
  # directory is already there.
  #
  def Grabber.create_directory(dir, options)
    if File.exist?( dir )
      return if File.directory?( dir )

      $stderr.puts '%s exists, but is not a directory.' % [ dir ]
      exit 7
    end

    # Loaded lazily, because it is only needed on this path.
    #
    require 'fileutils'

    $stderr.puts 'Creating %s...' % [ dir ] if options.verbose
    FileUtils.mkdir_p( dir )
  end

  # Build a grabber channel config file from scratch.
  #
  # Asks on stdin whether an existing, non-empty file may be overwritten,
  # fetches the channel list with Channel.get_available and then asks, per
  # channel, whether it should be written to options.config.
  #
  # End-of-file on stdin is treated as 'no' for the overwrite question and as
  # 'none' for the channel question, so that a closed stdin cannot cause an
  # endless stream of 'Invalid response' messages.
  #
  def Grabber.configure(options)
    $stderr.puts 'Using config file %s.' % [ options.config ]

    # Create config file directory if necessary.
    #
    Grabber.create_directory( File.dirname( options.config ), options )

    if File.exist?( options.config )
      unless File.file?( options.config )
        $stderr.puts '%s exists, but is not a regular file.' %
                     [ options.config ]
        exit 7
      end

      # Deal with the case of an existing, non-zero byte config file.
      #
      unless File.zero?( options.config )
        print <<"EOF"
The file #{options.config} already exists and is not empty.

There is currently no support for altering an existing configuration, so you
will have to reconfigure from scratch.

Do you wish to overwrite the old configuration? [yes, no (default = no)] ?
EOF

        loop do
          case $stdin.gets
          when /^(y|ye|yes)$/i
            break
          when nil, /^(no?|\n)$/i    # 'No' is the default; nil means EOF.
            puts 'Configuration aborted.'
            return
          else
            puts "Invalid response. Please choose either 'yes' or 'no'."
          end
        end
      end
    end

    # The channel list is always fetched with progress output switched off
    # and warnings switched on, whatever the caller asked for.
    #
    opts = options.clone
    opts.verbose = false
    opts.quiet = false

    $stderr.puts 'Obtaining list of channel names and numbers...'
    channels = Channel.get_available( opts )

    # Write a new configuration file.
    #
    count = 0
    File.open( options.config, 'w' ) do |f|
      all = false

      # Work through channels in ascending channel number order.
      #
      channels.sort_by { |num, _name| num }.each do |num, name|
        unless all

          # Keep asking until we get an answer we understand.
          #
          choice = nil
          until choice
            print 'Add channel %s (%s) [yes, no, all, none (default = yes)] ? ' %
                  [ name, num ]

            choice = case $stdin.gets
                     when nil                  then :none    # EOF
                     when /^(y|ye|yes|\n)$/i   then :yes
                     when /^no$/i              then :no
                     when /^(a|al|all)$/i      then :all
                     when /^none?$/i           then :none
                     else
                       puts "Invalid response. Please choose between 'yes', 'no', 'all' and 'none'."
                       nil
                     end
          end

          next if choice == :no
          break if choice == :none
          all = true if choice == :all
        end

        f.puts 'channel %s %s' % [ num, name ]
        count += 1
      end
    end

    puts 'Finished configuration of %d channel(s).' % [ count ]
  end

  # Read the grabber's configuration file.
  #
  # file is anything that yields lines from #each (e.g. an open File). Lines
  # starting with '#' and empty lines are skipped. Every other line is
  # expected to look like 'channel <number> <name>' (the word 'channel' is
  # optional); lines that do not are reported on stderr and ignored rather
  # than aborting the run.
  #
  # Returns a Hash that maps channel number (Integer) to Channel object.
  #
  def Grabber.read_config(file, verbose)
    $stderr.puts 'Parsing channel file...' if verbose

    channels = {}

    file.each do |line|
      next if line =~ /^(#|$)/

      m = line.match( /^(?i:channel )?(\d+)\s+(.+)$/ )
      unless m
        $stderr.puts 'Warning: Ignoring malformed configuration line: %s' %
                     [ line.chomp ]
        next
      end

      channel = Channel.new( m[1].to_i, m[2].rstrip )
      channels[channel.number] = channel
    end

    if verbose
      $stderr.puts '%d channel(s) found in configuration file.' %
                   [ channels.size ]
    end

    channels
  end

  # Fetch a page of HTML.
  #
  # url is a parsed HTTP URI and req the Net::HTTP request to send to it.
  # options must respond to #verbose, #quiet, #debug, #user_agent, #tries and
  # #sleep/#sleep=.
  #
  # Sleeps options.sleep seconds after every request. When the server answers
  # 503 'Please slow down', options.sleep is increased by BACKOFF_INCREMENT
  # and the request is repeated. Responses carrying a Location header are
  # followed, at most MAX_REDIRECTS times. Connection-level errors are
  # retried until options.tries attempts have failed.
  #
  # Returns the response body for an HTTP 200 response, otherwise nil.
  #
  def Grabber.get_page(url, req, options)
    failures = 0
    redirects = 0
    res = nil

    loop do
      begin
        if options.verbose

          # request_uri builds a new string. Appending the query to url.path
          # instead would modify the URI object that the caller handed us.
          #
          $stderr.puts ' Fetching %s ...' % [ url.request_uri ]
        end

        res = Net::HTTP.start( url.host, url.port ) do |http|
          req['User-Agent'] = options.user_agent
          http.request( req )
        end
      rescue EOFError, Errno::ECONNREFUSED, Errno::ECONNRESET,
             Errno::ETIMEDOUT, Timeout::Error => e
        failures += 1
        return nil unless failures < options.tries

        unless options.quiet
          $stderr.puts " '%s' whilst fetching page. Retry #%d..." %
                       [ e, failures ]
        end
        next
      end

      # Be nice (or not) to the server.
      #
      if options.verbose
        $stderr.puts " Sleeping %.2f second(s)...\n\n" % [ options.sleep ]
      end
      sleep options.sleep

      if res.code == '503' && res.body == 'Please slow down'
        options.sleep += BACKOFF_INCREMENT
        unless options.quiet
          $stderr.puts " Warning: UPC is throttling me. Increasing sleep time to %.2f seconds..." %
                       [ options.sleep ]
        end
        next
      end

      break unless res.key?( 'location' )

      # If we get an HTTP 3xx redirect, we need to follow it.
      #
      redirects += 1
      if redirects > MAX_REDIRECTS
        unless options.quiet
          $stderr.puts ' Warning: Too many HTTP redirections. Giving up...'
        end
        return nil
      end

      if options.debug
        $stderr.puts ' Following HTTP %s redirection...' % [ res.code ]
      end

      # merge resolves a relative Location against the current URL, and
      # request_uri keeps the query string of the redirect target.
      #
      url = url.merge( res['location'] )
      req = Net::HTTP::Get.new( url.request_uri )
    end

    res.code == '200' ? res.body : nil
  end

  # URL-encode a string for use as a URL path.
  #
  # Every byte of every character other than a space, an ASCII letter or
  # digit, '_', '.' and '-' becomes %XX (upper-case hex). Spaces become '+'.
  #
  def Grabber.url_encode(string)
    string.gsub( /[^ a-zA-Z0-9_.-]+/ ) do |unsafe|
      unsafe.unpack( 'C*' ).map { |byte| '%%%02X' % byte }.join
    end.tr( ' ', '+' )
  end

end

# A class for film ratings.
# class Rating IMDB_URL = 'http://uk.imdb.com' RATINGS_FILE = File.join( Grabber::WORKING_DIR, 'ratings.list' ) RATINGS_FILE_GZ = File.join( Grabber::WORKING_DIR, 'ratings.list.gz' ) CACHE_FILE = RUBY_VERSION >= '1.9.0' ? 'rating.cache' : 'ratings_cache.yaml' if RUBY_VERSION >= '1.9.0' SERIAL = Marshal else require 'yaml' SERIAL = YAML end @@rated = 0 @@not_rated = 0 @@cache = {} @@cache_positives = 0 @@cache_negatives = 0 @@ratings = {} attr_reader :timestamp attr_accessor :rating, :ctime, :mtime # Obtain the film ratings file from IMDB. # def Rating.get_ratings_list(options) # Download the ratings file if it doesn't exist or is older than 7 days. # See http://www.imdb.com/interfaces#plain for more information. # if ! File.exist?( RATINGS_FILE ) || ( NOW - File.mtime( RATINGS_FILE ) ) / ONE_DAY > 7 if options.verbose $stderr.puts 'Attempting to download IMDB ratings list via FTP...' end require 'net/ftp' Net::FTP.open( 'ftp.funet.fi' ) do |ftp| ftp.debug_mode = true if options.debug ftp.login ftp.chdir( '/pub/mirrors/ftp.imdb.com/pub' ) ftp.passive = true ftp.getbinaryfile( 'ratings.list.gz', RATINGS_FILE_GZ, 1024 ) end require 'shell' begin gunzip = Shell::CommandProcessor.new( Shell.new ). find_system_command( 'gunzip' ) # If we don't have gunzip in our PATH, an exception will be raised. # rescue Shell::Error::CommandNotFound $stderr.puts 'Please gunzip %s and then rerun.' % [ RATINGS_FILE_GZ ] exit end # Decompress ratings file. # $stderr.puts 'Decompressing ratings file...' if options.verbose system( gunzip, '-f', RATINGS_FILE_GZ ) end end # Cache a film rating, if found, or the fact that one wasn't found. # def Rating.cache_rating(title, options, rating) if rating $stderr.puts ' Succeeded: %s.' % [ rating ] if options.verbose # Cache the result and return a rating, e.g. 7/10. # @@cache[title] = Rating.new( rating ) @@rated += 1 @@cache[title].rating else $stderr.puts ' Failed.' if options.verbose # Cache a negative result and return nil. 
# @@cache[title] = Rating.new( false ) @@not_rated += 1 nil end end # Look up a film's rating on IMDB. # def Rating.imdb_rating(title, options) # Titles come from UPC as UTF-8, so we check for any accented alpha # characters in the title and, if we find them, convert the title to # Latin-1 (ISO-8859-1). # if RUBY_VERSION >= '1.9.0' title = title.clone.encode( 'ISO-8859-1' ) elsif title =~ /\xC3[\x80-\xBF]/ title = title.unpack( 'U*' ).pack( 'c*' ) end if options.verbose $stderr.puts ' Attempting to rate "%s"...' % [ title ] end if @@cache.key?( title ) # Update timestamp on cache entry. # @@cache[title].mtime = Time.now if @@cache[title].rating if options.verbose $stderr.puts ' Found in cache: %s.' % [ @@cache[title].rating ] end @@cache_positives += 1 return @@cache[title].rating elsif ! @@cache[title].rating $stderr.puts ' Negative result found in cache.' if options.verbose unless options.ignore_negatives @@cache_negatives += 1 return end $stderr.puts ' Ignoring cached negative...' if options.verbose end end if options.static_ratings rating = nil # if we're using threads, we have to wait for the first thread to # populate the ratings hash before we allow other threads to consult it. # options.threads.lock if options.threads # If static ratings are desired, create the hash of titles and ratings # from the static IMDB ratings file. # if @@ratings.empty? $stderr.puts ' Reading ratings file...' if options.verbose File.open( RATINGS_FILE ) do |f| rating = nil body = false f.each_line do |l| l.force_encoding( 'ISO-8859-1' ) if RUBY_VERSION > '1.9.0' # Get to the right point in the file. # body = true if l =~ /^MOVIE RATINGS REPORT/ next unless body if m = l.match( /^\s+\S{10}\s+\d+\s+(\d+\.\d+)\s+(.+)$/ ) # Remove year and any other trailing stuff from film title and # convert it to lower case. # t = m[2].sub( /\s*\(\d+\).*$/, '' ).downcase # Store the rating. # @@ratings[t] = m[1] end end if options.verbose $stderr.puts ' Read %d ratings.' 
% [ @@ratings.size ] end end end # It's now safe to access the ratings hash. # options.threads.unlock if options.threads l_title = title.downcase rating = @@ratings[l_title] + '/10' if @@ratings.key?( l_title ) return Rating.cache_rating( title, options, rating ) end # Otherwise, grab the rating dynamically from IMDB. # guide_title = title.clone res = nil rating_regex = /([\d\.]+\/\d+)<\/b> (?:.{0,64}]*>[\d,]+\svotes<\/a>)/mx loop do # When parsing the URL, we have to remove any ampersand entities in the # title. # url = URI.parse( IMDB_URL + '/find?q=%s;s=tt' % [ Grabber.url_encode( title.gsub( /&/, '&' ) ) ] ) req = Net::HTTP::Get.new( url.path + '?' + url.query ) return nil unless res = Grabber.get_page( url, req, options ) # A modified version of the title will be used for matching. # # Firstly, any occurrence of '&' in the title is changed to a literal # ampersand. # # Next, any non-alphanumeric characters are made optional, because these # may not be used to form the title in the IMDB database. The IMDB HTML, # may, however, use numeric entities to represent them, so we also have # to match against the possibility of those. This is further complicated # by the fact that the numeric entity may be decimal or hexadecimal. # # All ampersands and accented characters (ASCII A0 - FF) are then # allowed to match against their alphabetic or numeric HTML entity # equivalent. # # Finally, the word 'and' in the title is allowed to also match '&' # or its (hexa)decimal entity equivalent. # # Note that the # character must be escaped, as we will be interpolating # it into the /x style regex that follows, where it would otherwise be # treated as a comment character. # # We must also refer to the ampersand as \x26 in the second gsub's # substitution string, because a literal ampersand would be replaced by # the third gsub. # title.force_encoding( 'ASCII-8BIT' ) if RUBY_VERSION >= '1.9.0' mod_title = title.gsub( /&/, '&' ). 
gsub( /[^&[:alnum:]\xA0-\xFF]/, '(?:.?|\x26\#(x[[:xdigit:]]+|\d+);)' ). gsub( /[&\xA0-\xFF]/, '&(?:\#(x[[:xdigit:]]+|\d+)|[[:alpha:]]+);' ). gsub( /and/i, '(?:and|&\#(x26|40);|&)' ) if m = res.match( / # Most film titles return 'Popular Titles', but occasionally one # will return just 'Exact Matches' (or even 'Approx Matches', which # we ignore). #

(?:Popular\sTitles | Titles\s\(Exact\sMatches\) ) <\/b>\s+ # Now we can grab the URL path of the first film in the list, which # may not actually be the right one if the film has been remade. # \(Displaying\s\d+\sResults?\).+?]*> # But we must also make sure that we really did match the same title, # not a substring of some other title. To achieve a slightly fuzzy # match, we match case-insensitively, using an embedded pattern-match # modifier (i.e. (?i:pattern) ) and a modified version of the title. # # This can be done, because the interpolation of the result from the # code below takes place before the regex is matched. # (?: # At this point, we can match either directly on the title: # (?i:#{mod_title})<\/a> | # Or the film may have originally had a foreign title, in which # case it may be followed by an 'a.k.a.' line with the translated # title that we're looking for: # [^<]+<\/a>[^<]+
.+?aka\s"(?i:#{mod_title})"<\/em> ) /x ) # Now we follow the URL path to get the ratings page. # url = URI.parse( IMDB_URL + m[1] ) req = Net::HTTP::Get.new( url.path ) return nil unless res = Grabber.get_page( url, req, options ) # Extract the rating from the page. # break elsif res !~ rating_regex && title.sub!( /, (A|The)$/, '' ) # If we haven't just fetched the film's ratings page and its title # ends with ', A' or ', The', we can try again with this string at the # beginning of the title instead. # title = $1 + ' ' + title if options.debug $stderr.puts ' Reattempting as "%s"...' % [ title ] end redo else # We must be on the ratings page. # break end end # At this point, we should have a page that contains a rating for this # film. # m = res.match( rating_regex ) Rating.cache_rating( guide_title, options, m ? m[1] : nil ) end # Load the film rating cache from disc. # def Rating.load_cache(options) if File.exist?( options.ratings ) $stderr.puts 'Loading ratings cache file...' if options.verbose @@cache = File.open( options.ratings ) { |f| SERIAL.load( f ) } || {} # Expire any cache entries older than a week. # orig_size = @@cache.size positive = 0 negative = 0 $stderr.puts 'Removing expired ratings cache entries...' if options.debug @@cache.delete_if do |title, entry| # Delete on this basis: # if ( NOW - entry.mtime ) / ONE_DAY > 7 true else # Or count the positive/negative cache entries. # entry.rating ? positive += 1 : negative += 1 false end end if options.verbose $stderr.puts '%d rating(s) expired. %d remaining, of which %d positive and %d negative.' % [ orig_size - @@cache.size, @@cache.size, positive, negative ] end else $stderr.puts 'No ratings cache file found.' end end # Dump the film ratings cache to disc. # def Rating.dump_cache(options) $stderr.puts 'Dumping ratings cache file...' if options.verbose File.open( options.ratings, 'w' ) { |f| SERIAL.dump( @@cache, f ) } if options.verbose $stderr.puts '%d ratings cache entries written.' 
% [ @@cache.size ] end end def Rating.rated @@rated end def Rating.not_rated @@not_rated end def Rating.cache_positives @@cache_positives end def Rating.cache_negatives @@cache_negatives end def initialize(rating) @rating = rating @ctime = @mtime = Time.now end end # A class for television programmes. # class Programme # Category translations. # CATEGORIES = { 'actie' => 'Action', 'algemeen' => 'Misc', 'atletiek' => 'Sports', 'avontuur' => 'Action', 'beeldende kunst' => 'Arts/Culture', 'detective' => 'Crime/Mystery', 'documentaire' => 'Documentary', 'drama' => 'Drama', 'educatie' => 'Educational', 'erotiek' => 'Adult', 'extreme' => 'Sports', 'gevechtssport' => 'Sports', 'gezondheid' => 'Health/Medical', 'historisch' => 'History', 'kids / jeugd' => 'Children', 'kids/jeugd' => 'Children', 'Klussen' => 'HowTo', 'koken' => 'Food', 'komedie' => 'Comedy', 'kunst / cultuur' => 'Arts/Culture', 'kunst/cultuur' => 'Arts/Culture', 'lifestyle' => 'Educational', 'melodrama' => 'Soaps', 'militair' => 'War', 'mode' => 'Educational', 'motorsport' => 'Sports', 'musical' => 'Movies', 'muziek' => 'Art/Music', 'nieuws' => 'News', 'paardensport' => 'Sports', 'religie' => 'Spiritual', 'romantiek' => 'Romance', 'sci-fi' => 'SciFi/Fantasy', 'show' => 'Game', 'show/spelshow' => 'Game', 'speelfilm' => 'Movies', 'sport' => 'Sports', 'sportmagazine' => 'Sports', 'talkshow' => 'Talk', 'teamsporten' => 'Sports', 'technologie' => 'Science/Nature', 'tekenfilms' => 'Children', 'tennis / squash' => 'Sports', 'theater / dans' => 'Arts/Culture', 'thriller' => 'Crime/Mystery', 'tuinieren' => 'HowTo', 'voetbal' => 'Sports', 'watersport' => 'Sports', 'wintersport' => 'Sports', 'vrije tijd' => 'Educational' } CACHE_FILE = 'programme.cache' LITERAL_AMPERSAND = /&(?!amp;)/ @@cache = {} @@cached = 0 attr_reader :actors, :category, :desc, :directors, :episode, :presenters, :rating, :subcategory, :subtitle, :time, :title attr_accessor :start_time, :stop_time # Load the programme cache from disc. 
# def Programme.load_cache(options) if File.exist?( options.cache ) $stderr.puts 'Loading programme cache file...' if options.verbose @@cache = File.open( options.cache ) { |f| Marshal.load( f ) } || {} if options.verbose $stderr.puts '%d programme cache entries found.' % [ @@cache.size ] end else $stderr.puts 'No programme cache file found.' end end # Dump the programme cache to disc. # def Programme.dump_cache(options) orig_size = @@cache.size # Expire any cache entries that end earlier than midnight today. # if options.debug $stderr.puts 'Removing expired programme cache entries...' end @@cache.delete_if do |prog_id, programme| programme.stop_time < NOW - ( NOW.to_i % ONE_DAY ) end if options.verbose $stderr.puts '%d cached programmes(s) expired and %d remaining.' % [ orig_size - @@cache.size, @@cache.size ] end $stderr.puts 'Dumping programme cache file...' if options.verbose File.open( options.cache, 'w' ) { |f| Marshal.dump( @@cache, f ) } if options.verbose $stderr.puts '%d programme(s) written to cache.' % [ @@cache.size ] end end # Obtain a list of a day' television programmes from a TV guide page. # def Programme.get_programme_list(guide) return [] if guide.nil? # Get start time, EID (unique programme ID) and the programme URL path. # guide.scan( /\s+\s+ (\d+:\d+)<\/span>\s+ <\/th>\s+ \s+ /mx ).collect { |prog| Programme.create( *prog ) } end # Instantiate a new Programme object or return an existing one if the # programme has been cached. # def Programme.create(start_time, prog_id, url_path) if class_variable_defined?( :@@cache ) && @@cache.key?( prog_id ) return @@cache[prog_id] end new( prog_id, start_time, url_path ) end def Programme.cached @@cached end def initialize(prog_id, start_time, url_path) @prog_id = prog_id @time = start_time @url_path = url_path @presenters = [] @directors = [] @actors = [] @rating = nil end def cache @@cache[@prog_id] = self @@cached += 1 end def cached? 
@@cache.key?( @prog_id ) end # Obtain the title, broadcast time, description and category of a programme. # def get_detail(timescope, options) fetched = 0 url = URI.parse( Grabber::BASE_URL + @url_path ) req = Net::HTTP::Get.new( url.path ) return fetched unless detail = Grabber.get_page( url, req, options ) # Needed for Ruby 1.9.x. # detail.force_encoding( 'UTF-8' ) if RUBY_VERSION >= '1.9.0' begin @title, @desc, @category, subcategory, duration = detail.match( /

([^<]+)<\/h3>.+ # Title \s+ ([^<]+) # Description <\/div> .+?
Genre:<\/dt>
([^<]+)<\/dd>\s+ # Category
Subgenre:<\/dt>
([^<]+)<\/dd>.+? # Subcategory
Duur:<\/dt>
(\d+)\smin<\/dd> # Duration /mx )[1..5] # Description will be right-padded with spaces. # @desc.rstrip! rescue NoMethodError # We probably tried to call [] on NilClass, which would mean that our # regex didn't match the program details. The cause is probably that a # page of Web server errors was returned instead of the expected page. # # On the other hand, the programme may be missing a duration. # unless detail =~ /
Duur:<\/dt>
(\d+)\smin<\/dd>/ unless options.quiet $stderr.puts 'Warning: Programme has no end time or duration. Ignoring...' end return 0 end if options.ignore_errors unless options.quiet $stderr.puts 'Warning: Ignoring a programme data error...' end return 0 end $stderr.puts "Error while scanning page for program details. Page was:\n\n" $stderr.puts detail $stderr.puts "\nPlease include this text when reporting a problem." exit 9 end fetched += 1 if m = detail.match( / (#{@time}\s+-\s+\d+:\d+) /x ) @time = m[1] else if options.debug $stderr.puts ' Broadcast unknown on programme detail page.', ' End time calculated from start time and duration.' end # If we reached here, it means that this particular broadcast isn't # known on the programme page referred to by the channel's day # programming page. # # That means we can't parse the end time of the broadcast, so we'll # have to calculate it from the start time and duration. # h, m = @time.split( ':' ) m = m.to_i + duration.to_i h = ( h.to_i + m / 60 ) % 24 m %= 60 @time = '%s - %02d:%02d' % [ @time, h, m ] end if options.debug $stderr.puts <<"EOF" Title: #{@title} Time: #{@time} Category: #{@category} EOF end unless @desc.empty? # Check for directors. # if m = detail.match( /
Regie:<\/dt>
([^<]+)<\/dd>/ ) @directors = m[1].split( ', ' ) if options.debug $stderr.puts " Directors: #{@directors.inspect.delete( '"' )}" end end # Check for actors. # if m = detail.match( /
(?:Cast|Met):<\/dt>
([^<]+)<\/dd>/ ) @actors = m[1].split( ', ' ) if options.debug $stderr.puts " Actors: #{@actors.inspect.delete( '"' )}" end end # Check for censor board rating. # if m = detail.match( /
(?:Kijkwijzer|Leeftijd):<\/dt>
([^<]+)<\/dd>/ ) @rating = m[1] if options.debug $stderr.puts " Censor board rating: #{@rating}" end end end # Can we figure out whether there's a presenter? # # We isolate from 'Presented by ' to the first full-stop. # # The 'pres != desc' check is required in order to know whether # substitution actually occurred. # if ( pres = @desc.sub( /^.*(?:(?:Presented|Hosted) by|Gepresenteerd door) (.+?)\..*$/i, "\\1" ) ) && pres != @desc # Grab each string of consecutive words that start with a capital # letter. That should give us an array of names, each of which # possibly has a trailing space. # @presenters = pres.scan( /((?:[[:upper:]](?:'[[:upper:]])?[[:lower:]]+ ?)+)/ ). flatten if options.debug $stderr.puts " Presenters: #{presenters.inspect.delete( '"' )}" end # FIXME: Not sure if this is needed any more. # @presenters.each { |presenter| presenter.rstrip! } end fetched end # Derive subtitle, episode number, etc. for a programme. # def derive_fields(options) # Remove any unhelpful text from the end of the description. # #@desc.sub!( / ?Niet beschikbaar$/, '' ) # If the title features a colon, make the left-hand side the title and the # right-hand side the subtitle, except when the colon appears to be the # separator in a time, i.e. HH:MM. Otherwise, keep the title as is. # @title, @subtitle = @title.split( /:+\s*(?!\d)\s*/, 2 ) unless @subtitle # We still don't have a subtitle, so check whether the first sentence of # the description contains only words that begin with a capital letter. # # We also allow digits and punctuation, so that we can recognise cardinal # and ordinal numbers, complex sentences, names containing initials, etc. # # The desc.match pattern is a short-circuit operation that quickly # enables us to determine whether the description can possibly contain a # usable description. This saves time by not applying the complex regex # unless necessary. Otherwise, there's a chance the program will get # stuck performing exponential backtracking. 
# # An example of a problematic description that would cause the program to # get stuck is the following: # # 1300BST: US PGA Tour Golf, 1400BST: Challenge Series Golf, 1530BST: WTA # Tennis, 1600BST: ICC Cricket, 1630BST: ATP Tennis. # # Needed for Ruby 1.9.x. # @desc.force_encoding( 'ASCII-8BIT' ) if RUBY_VERSION >= '1.9.0' if @desc.match( /[.!?]+\s+/ ) && @subtitle = @desc.match( /^( (?: (?: # Start with a single digit or capital letter... # [\d[:upper:]] (?: (?: # ...followed by zero or more of: # # - alphanumeric characters # - various punctuation characters, or... # [[:alpha:]\d"\#$%'),\-\/:;\]\}] | # - a UTF-8 accented alpha character. See: # www.utf8-chartable.de unicode-utf8-table.pl # (?:\xC3[\x80-\xBF]) )* (?: # Alternatively, this may catch Ep. 27 style episode # numbering... # \.?\s\d+ )? | # ...and this might catch initials in names. # \.? ) | (?: # If it's not a word, these are the likely word # separators... # [\s&(*+\-<=>\[{] | # or we may find some HTML entities, lower-case articles, # conjunctions or prepositions... # # ...either in English... # &|'n'|a|an|and|for|in|of|on|or|the|to| # ...or in Dutch: # de|en|het|naar|tot|van|voor|vs )+ ) )+ # End with one or more full-stops, exclamation marks or question # marks, followed by one or more spaces. # [.!?]+\s+ ) /x ) # Remove the subtitle from the start of the description. # @desc.slice!( @subtitle[1] ) @subtitle = @subtitle[1].rstrip end end if @subtitle # Sometimes, the subtitle will simply duplicate the programme title # (except, perhaps, for a trailing full-stop). In this case, we abandon # the subtitle. # pruned_sub = @subtitle.chomp( '.' ) if pruned_sub == @title @subtitle = nil # Prune any trailing full-stop from subtitle if requested, but be careful # not to prune the last dot of an ellipsis. # elsif options.prune_subs && pruned_sub.chomp( '.' ) == pruned_sub @subtitle = pruned_sub end end # Detect trailing episode number in the description and use this as # subtitle if available. 
If we already have a subtitle, append the episode # number to increase the likelihood of uniqueness. # if @episode = @desc.match( /( (?:deel|part|episode) (\d+)\.?)$/i ) @desc.slice!( @episode[1] ) @subtitle = ( ( @subtitle || '' ) + @episode[1] ).lstrip # Remember just the episode number. # @episode = @episode[2].to_i end @title.gsub!( LITERAL_AMPERSAND, '&' ) @subtitle.gsub!( LITERAL_AMPERSAND, '&' ) if @subtitle @desc.gsub!( LITERAL_AMPERSAND, '&' ) if RUBY_VERSION >= '1.9.0' @title.force_encoding( 'UTF-8' ) @subtitle.force_encoding( 'UTF-8' ) if @subtitle @desc.force_encoding( 'UTF-8' ) end end end # A class for television channels. # class Channel attr_reader :number, :name # Obtain the list of available TV channels from UPC. # def Channel.get_available(options) url = URI.parse( Grabber::BASE_URL + '/TV/wa/grid/?startDateTime=%s' % [ NOW.strftime( '%Y-%m-%dT%H:%M:%SZ' ) ] ) req = Net::HTTP::Get.new( url.path + '?' + url.query ) page = Grabber.get_page( url, req, options ) if page.nil? unless options.quiet $stderr.puts 'Failed to fetch channel list from UPC. Aborting...' end exit 5 end # Needed for Ruby 1.9.x. # page.force_encoding( 'UTF-8' ) if RUBY_VERSION >= '1.9.0' # Create an array of channel names. # # where 'n' is the channel number. # upc_channels = {} # Replace any ampersand entities with literal ampersands and slashes with # backslashes. # page.scan( /\s+ ([^<]+)<\/span> /mx ).collect do |num, name| # Names are right-padded with spaces. # name.rstrip! upc_channels[num.to_i] = name.gsub( /&/, '&' ).gsub( /\//, '\\' ) end if upc_channels.size.zero? unless options.quiet $stderr.puts 'No channels found at UPC. Aborting...' end exit 5 end if options.verbose $stderr.puts 'UPC nominally has data for %d channels.' % [ upc_channels.size ] if options.debug $stderr.puts "Those channels are:\n\n" upc_channels.sort do |a, b| # Sort on channel name, then channel number, because some sport # channels are identically named. 
# ( a[1].downcase <=> b[1].downcase ).nonzero? || ( a[0] <=> b[0] ) end.each do |num, name| $stderr.puts "%3d %s" % [ num, name ] end $stderr.puts end end upc_channels end def initialize(number, name) @number = number @name = name end # Check to see whether UPC recognises a given channel. # def check(upc_channels) if ! upc_channels.values.include?( @name ) $stderr.puts 'Warning: UPC may not know of the channel called %s.' % [ @name ] stub = @name.sub( /[^[:alpha:]].*$/, '' ) upc_channels.values.each do |upc_name| if stub.length > 0 && upc_name.index( stub ) $stderr.puts 'Perhaps you mean %s.' % [ upc_name ] end end end end # Obtain a TV guide page of programming for a given day. # def get_tv_guide(day, options) Guide.new( @name, day, options ) end # Determine the URL path to a logo for a given channel. # def icon_path(guide) guide.match( /