Picture of the Day

Keith Vetter - 2023-08-28: For a game I'm writing (coming soon), I needed access to a many pretty pictures so that I could replay the game with a different picture. I thought of using Flickr API but didn't want to deal with api keys and such.

Then I realized that both Wikipedia and Wikimedia Commons have decades worth of daily pretty pictures. I couldn't find any public API for downloading Pictures of the Day, so I decided to write my own.

This is pure web-scraping using the dom module and xpath queries. The web pages being scraped luckily seem to be stable, but the format is not entirely consistent: some PotD are multiple images, some are animations, some are videos and some have been deleted. I've tested this pretty thoroughly so hopefully I've handled all the different cases.

There's documentation in the file's header, plus demo code at the end that can be run from the command line. In the next few days I'll post some more code that runs a GUI downloading PotD images. Here it is: Picture of the Day Demo

##+##########################################################################
#
# wiki_potd.tcl -- downloads random Picture of the Day from Wikipedia or Wikimedia Commons
# by Keith Vetter 2023-08
#
# Wikipedia: https://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/August_2023
# Wikimedia Commons: https://commons.wikimedia.org/wiki/Template:Potd/2023-08
#
# Two main functions and one auxiliary function:
#  ::POTD::GetPOTD service year month day ?fitness?
#  ::POTD::RandomPOTD service ?fitness?
#
#  ::POTD::SetLogger func  -- log messages will be sent to "$func $msg"
#

package require http
package require tls
http::register https 443 [list ::tls::socket -tls1 1]
package require tdom

namespace eval ::POTD {
    variable COMMONS_URL https://commons.wikimedia.org/wiki/Template:Potd
    variable PEDIA_URL https://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day
    variable USER_LOGGER ""

    variable month_url ""
    variable day_url ""
    variable meta [dict create]
    variable ERROR_STATUS 0
    variable NO_PICTURE_STATUS 1
    variable SUCCESS_STATUS 2

    variable SERVICES {"Wikipedia" "Commons"}
}

proc ::POTD::SetLogger {func} {
    # Installs custom logger: a function which takes one argument: msg
    set ::POTD::USER_LOGGER $func
}

proc ::POTD::GetPOTD {service year month day {fitness {}}} {
    # Gets POTD from Wikipedia or Wikimedia Commons for given date
    #  date range is 2004/11 - today
    #  fitness: optional tuple listing maxWidth and maxHeight,
    #           used to compute bestfit (see below in metadata)
    #
    # Returns two items:
    #   1. meta:
    #        service  : Wikipedia or Commons
    #        status   : 0 error, 1 no picture for this day, 2 success
    #        date     : YYYY/MM/DD
    #        desc     : description of the POTD
    #        month_url: url of the month POTD page
    #        day_url  : url of the specific day page
    #        bestfit  : if fitness given, index of the largest image smaller than fitness
    #        emsg     : error message if status == 0
    #   2. list of POTD image links {width height url}, sorted by increasing height
    #      maybe empty on days with no POTD

    variable month_url ""
    variable day_url ""
    variable meta [dict create]

    set n [catch {
        set results [::POTD::_GetPOTD $service $year $month $day $fitness]
    } emsg]

    if {$n} {
        set date $year/$month/$day
        set meta [dict create status $::POTD::ERROR_STATUS service $service \
                      date $date emsg $emsg month_url $month_url day_url $day_url \
                      desc "" bestfit -1]
        return [list $meta {}]
    }
    return $results
}
proc ::POTD::RandomPOTD {service {fitness {}}} {
    # Gets POTD from Wikipedia or Wikimedia Commons for a random date

    lassign [::POTD::_RandomDate $service] year month day
    return [::POTD::GetPOTD $service $year $month $day $fitness]
}

proc ::POTD::_GetPOTD {service year month day {fitness {}}} {
    # First gets the month POTD page, then extracts the day POTD page,
    # then extracts all resolutions of the POTD image

    variable month_url
    variable day_url

    set service [string totitle $service]
    if {$service ni $::POTD::SERVICES} {
        ::POTD::_Error "Unknown service '$service', should be either $::POTD::SERVICES"
    }

    set date $year/$month/$day
    ::POTD::_Logger "Getting $service Picture of the Day for $date"

    if {$service eq "Commons"} {
        set month_url [format "%s/%d-%02d#%d" $::POTD::COMMONS_URL $year $month $day]
    } else {
        set month_name [clock format [clock scan "2000-$month-24" -format %Y-%m-%d] -format %B]
        set month_url "${::POTD::PEDIA_URL}/${month_name}_$year#$day"
    }

    lassign [::POTD::_ExtractDayPage $month_url $year $month $day] day_url desc

    if {[file tail $day_url] eq "File:No_image.svg"} {
        set desc "No image available for $date"
        set meta [dict create date $date month_url $month_url day_url $day_url desc $desc \
                      status $::POTD::NO_PICTURE_STATUS bestfit -1 service $service emsg ""]
        return [list $meta {}]
    }

    set resolutions [::POTD::_ExtractResolutions $day_url]

    set bestfit -1
    if {$fitness ne {}} {
        lassign $fitness maxWidth maxHeight
        set bestfit [::POTD::_BestFit $resolutions $maxWidth $maxHeight]
    }

    set meta [dict create date $date month_url $month_url day_url $day_url desc $desc \
                  status $::POTD::SUCCESS_STATUS bestfit $bestfit service $service emsg ""]
    return [list $meta $resolutions]
}
proc ::POTD::_UrlHost {url} {
    # Extracts the host part of a URL

    set n [regexp {^https?://[^/]*} $url host]
    if {! $n} {
        ::POTD::_Error "Could not extract host part of '$url'"
    }
    return $host
}
proc ::POTD::_BestFit {resolutions maxWidth maxHeight} {
    # Returns index of largest image smaller than maxHeight and maxWidth

    set best -1
    foreach item $resolutions idx [::POTD::_range [llength $resolutions]] {
        lassign $item width height url
        if {$maxWidth > 0 && $width > $maxWidth} break
        if {$maxHeight > 0 && $height > $maxHeight} break
        set best $idx
    }
    return $best
}
proc ::POTD::_ExtractDayPage {month_url year month day} {
    # Scrape the potd template for the url to the correct day's image

    set date "$year/$month/$day"
    ::POTD::_Logger "Extracting day $day from monthly page"
    set html [::POTD::DownloadUrl $month_url]

    set n [catch {set dom [::dom parse -html $html]}]
    if {$n} {::POTD::_Error "Bad HTML: $emsg" }

    set ::xdom $dom
    # Some POTD are multiple images, with id="$day/#"
    set xpath "//*\[@id='$day/1'\]"
    set id_nodes [$dom selectNodes $xpath]
    if {$id_nodes eq {}} {
        set xpath "//*\[@id='$day'\]"
        set id_nodes [$dom selectNodes $xpath]
        if {$id_nodes eq {}} {
            ::POTD::_Error "could not find id tag for image page for $date"
        }
    }
    set id_node [lindex $id_nodes 0]
    if {[$id_node nodeName] eq "span"} {
        set id_node [$id_node parent]
    }

    set table [$id_node nextSibling]
    if {[$table nodeName] eq "style"} {
        set table [$table nextSibling]
    }
    if {[$table nodeName] eq "div"} {
        set table [$table selectNodes {.//table[1]}]
    }
    if {$table eq "" || [$table nodeName] ne "table"} {
        ::POTD::_Error "could not find table with img for $date"
    }

    set img [lindex [$table selectNodes {.//img}] 0]
    if {$img eq {}} {
        if {[$table selectNodes .//video] ne ""} {
            ::POTD::_Error "POTD for $date is a video"
        }
        if {[$table selectNodes {.//*[contains(text(),'File deleted')]}] ne ""} {
            ::POTD::_Error "POTD for $date has been deleted"
        }
        ::POTD::_Error "could not find img in old month page for $date"
    }
    set a_node [$img parent]
    if {[$a_node nodeName] ne "a"} { ::POTD::_Error "could not find <a> in old month page for $date" }
    set day_url [$a_node getAttribute href]
    set day_url [::POTD::_FixUrl $day_url [::POTD::_UrlHost $month_url]]

    # Get description if possible
    set desc "missing description"
    set fig_node [$table selectNodes .//figcaption\[1\]]
    if {$fig_node ne ""} {
        # Commons uses a <figcaption> node for the description
        set desc [$fig_node asText]
    } else {
        # Wikipedia: first <td> contains the image, second <td> description
        # except for some animations which is in the third <td>
        # Usually these <td> are in the same <tr> but sometimes not

        # set p_node [$table selectNodes {.//td[2]//p[1]}]
        set td_nodes [$table selectNodes .//td]
        if {[llength $td_nodes] >= 2} {
            set p_nodes [[lindex $td_nodes 1] selectNodes {.//p}]
            while {$p_nodes ne ""} {
                set desc [string trim [[lindex $p_nodes 0] asText]]
                if {$desc ne "View animation"} break
                set p_nodes [lrange $p_nodes 1 end]
            }

        }
    }
    return [list $day_url $desc]
}

proc ::POTD::_ExtractResolutions {day_url} {
    # Extracts links to images at all resolutions possible
    # each entry is {width height url}, sorted by height

    # NB. fails on older pages with "No higher resolution available"
    # NB. if page has a 1x1 image, then that image has been deleted

    ::POTD::_Logger "Extracting all image sizes from day page"

    set html [::POTD::DownloadUrl $day_url]
    set host [::POTD::_UrlHost $day_url]
    set n [catch {set dom [::dom parse -html $html]} emsg]
    if {$n} {::POTD::_Error "Bad HTML: $emsg" }

    set xpath {//*[contains(@class,"mw-thumbnail-link")]}
    set a_nodes [$dom selectNodes $xpath]

    if {[llength $a_nodes] == 0} {
        return [::POTD::_ExtractSingleResolution $dom $host]
    }

    set all {}
    foreach a_node $a_nodes {
        set url [$a_node getAttribute href]
        set size [$a_node asText]
        set size [regsub -all "," $size ""]
        set n [regexp {(\d+)\s+.\s+(\d+)} $size . width height]
        if {! $n} { ::POTD::_Error "could not extract size from '$size'" }
        lappend all [list $width $height [::POTD::_FixUrl $url $host]]
    }
    set all [lsort -index 1 -integer $all]
    ::POTD::_Logger "Found [::POTD::_Plural [llength $all] size]"
    return $all
}

proc ::POTD::_ExtractSingleResolution {dom host} {
    # Some POTD only have a single resolution with a different HTML scheme
    set all {}

    set xpath {//*[@id='file']//img[1]}
    set img [$dom selectNodes $xpath]
    if {$img ne ""} {
        set url [$img getAttribute src]
        set width [$img getAttribute width]
        set height [$img getAttribute height]
        lappend all [list $width $height [::POTD::_FixUrl $url $host]]
    }
    return $all
}

proc ::POTD::DownloadUrl {url} {
    # Downloads a given URL

    ::POTD::_Logger "downloading $url"
    set ::url $url
    set token [::http::geturl $url]
    set html [::http::data $token]
    set ncode [::http::ncode $token]
    ::http::cleanup $token

    if {$ncode != 200} {
        ::POTD::_Error "failed to download $url"
    }
    return $html
}
proc ::POTD::_Logger {msg} {
    # Logs message to screen or to custom logger

    variable USER_LOGGER
    if {$USER_LOGGER ne ""} {
        catch {$USER_LOGGER $msg}
    }
}

proc ::POTD::_Error {msg} {
    # Logs then throws an error

    catch {::POTD::_Logger $msg}
    error $msg
}
proc ::POTD::_RandomDate {service} {
    # Picks a random date in the range of available POTD images

    # Wikimedia Commons starts 2004/11
    # Wikipedia starts 2004/4 with funky HTML until 2007

    scan [clock format [clock seconds] -format "%Y %m %d"] {%d %d %d} YEAR MONTH DAY
    if {$service eq "Wikipedia"} {
        set years [::POTD::_range 2007 $YEAR+1]
    } else {
        set years [::POTD::_range 2004 $YEAR+1]
    }
    set year [::POTD::_lpick $years]

    if {$year == 2004} {
        set months {11 12}
    } elseif {$year == $YEAR} {
        set months [::POTD::_range 1 $MONTH+1]
    } else {
        set months [::POTD::_range 1 12+1]
    }
    set month [::POTD::_lpick $months]

    if {$year == $YEAR && $month == $MONTH} {
        set days [::POTD::_range 1 $DAY+1]
    } elseif {$year == 2004 && $month == 5} {
        setg days [::POTD::_range 14 31+1]
    } elseif {$month eq 2} {
        set days [::POTD::_range 1 29]
    } elseif {$month in {4 6 9 11}} {
        set days [::POTD::_range 1 31]
    } else {
        set days [::POTD::_range 1 32]
    }
    set day [::POTD::_lpick $days]
    return [list $year $month $day]
}

proc ::POTD::_lpick {llist} {
    # Selects random item from list
    set len [llength $llist]
    set idx [expr {int(rand() * $len)}]
    return [lindex $llist $idx]
}
proc ::POTD::_range {args} {
    # Akin to python's range command, except:
    # * accepts numbers of form a, a+b or a-b
    # * cannot handle downward ranges

    if {[llength $args] == 1} {
        lassign [concat 0 $args 1] lo hi step
    } elseif {[llength $args] == 2} {
        lassign [concat $args 1] lo hi step
    } elseif {[llength $args] == 3} {
        lassign $args lo hi step
    } else {
        error "Wrong number of arguments to ::POTD::_range: '$args'"
    }
    if {[regexp {^-?\d+[+-]\d+$} $lo]} { set lo [expr $lo] }
    if {[regexp {^-?\d+[+-]\d+$} $hi]} { set hi [expr $hi] }
    if {[regexp {^-?\d+[+-]\d+$} $step]} { set step [expr $step] }

    set result {}
    for {set idx $lo} {$idx < $hi} {incr idx $step} {
        lappend result $idx
    }
    return $result
}
proc ::POTD::_Plural {cnt single {multi ""}} {
    # Returns "1 dog" or "3 dogs"

    if {$cnt == 1} { return "$cnt $single" }
    if {$multi eq ""} { set multi "${single}s"}
    return "$cnt ${multi}"
}
proc ::POTD::_FixUrl {url host} {
    # Makes sure url has a host component
    if {[string match "//*" $url]} {
        set url [string cat "https:" $url]
    } elseif {[string match "/*" $url]} {
        set url [string cat $host $url]
    }
    return $url
}

#################################################################
#
# Demo code
#

if {[info exists argv0] && [file tail [info script]] eq [file tail $argv0]} {
    set maxWidth 1720   ;# .5 * screen width
    set maxHeight 1296  ;# .9 * screen height
    set fitness [list $maxWidth $maxHeight]

    set service [::POTD::_lpick $::POTD::SERVICES]

    # Parse command line with optional service and optional date
    lappend argv SENTINEL
    set cnt 0
    foreach arg $argv {
        if {[string totitle $arg] in $::POTD::SERVICES} {
            set service $arg
            continue
        }
        if {$arg eq "SENTINEL"} {
            if {$cnt > 0} break
            set arg "random"
        }

        incr cnt
        if {$arg eq "random"} {
            lassign [::POTD::RandomPOTD $service $fitness] meta all
        } else {
            set n [scan $arg %d%c%d%c%d year _ month _ day]
            if {$n != 5} { error "bad date format: '$arg', should be YYYY/MM/DD" }
            lassign [::POTD::GetPOTD $service $year $month $day $fitness] meta all
        }

        if {[dict get $meta status] == $::POTD::ERROR_STATUS} {
            set emsg [dict get $meta emsg]
            puts "ERROR: $emsg"
            return
        }

        set bestfit [dict get $meta bestfit]
        if {$bestfit == -1} {
            set desc [dict get $meta desc]
            puts "$desc"
            return
        }

        set url [lindex $all $bestfit 2]
        set img_data [::POTD::DownloadUrl $url]

        puts ""
        puts "Got [string totitle $service] POTD for [dict get $meta date]"
        puts ""
        puts "Metadata:"

        set longest [tcl::mathfunc::max 0 {*}[lmap key [dict keys $meta] {string length $key}]]
        foreach key [lsort -dictionary [dict keys $meta]] {
            puts [format "  %-${longest}s = %s" $key [dict get $meta $key]]
        }

        puts ""
        puts "Number of image sizes: [llength $all]"
        foreach item $all idx [::POTD::_range [llength $all]] {
            set star [expr {$idx == $bestfit ? "*" : " "}]
            lassign $item width height url
            puts [format " %s %4d x %4d" $star $width $height]
        }
        set date [dict get $meta date]
        scan $date "%d/%d/%d" year month day
        set fname [format "potd_%s_%4d_%02d_%02d.jpg" [string index $service 0] $year $month $day]
        set fout [open $fname wb]
        puts -nonewline $fout $::img_data
        close $fout
        puts ""
        puts "Saving to $fname"

        if {$tcl_platform(os) eq "Darwin"} {
            puts "exec open $fname"
            exec open $fname
        }
    }
}