#!/usr/bin/perl -w # Copyright (c) 2008-2010 Ilmari Karonen . # # Permission to use, copy, modify, and/or distribute this # software for any purpose with or without fee is hereby granted, # provided that the above copyright notice and this permission # notice appear in all copies. # # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL # WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL # THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR # CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM # LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. # The following lines are for running this script as an SGE job: # #$ -N commons-mime-statistics #$ -S /usr/bin/perl #$ -o public_html/stats/commons_mime_statistics_log.txt -j y #$ -M vyznev@toolserver.org -m ea #$ -l sql-s4-rr=1 # Typical runtimes as of 2012 are under 30 mins, but it's been more in the past #$ -l h_rt=4:05:00 #$ -l s_rt=4:00:00 # Typical peak memory usage on Solaris seems to be around 22M, but Linux reports 200M for some reason??? #$ -l virtual_free=200M # This script should work on both Linux and Solaris #$ -l arch=* use strict; use utf8; # not really needed for now, but for the sake of consistency use Time::HiRes 'time'; use POSIX 'strftime'; use DBI; use LWP::UserAgent; use XML::Simple; use Digest::MD5 'md5_hex'; use Data::Dumper 'Dumper'; use URI; use Getopt::Long; use constant TIMEOUT => 1200; # stop db retries after this many secs # Command line parsing (currently used only for debug flag): my $debug; GetOptions('debug' => \$debug) or die "Usage: $0 [--debug]\n"; # Configuration and general setup: # TODO: read these from the env or command line my $dbname = "commonswiki"; my $server = "commons.wikimedia.org"; my $botname = "MIMEStatBot"; my $pwfile = "mimestatbot.pass"; my $dumpfile = "public_html/stats/commons_mime_statistics_data.txt"; my $sitename = "Commons"; my $pageprefix = ($debug ? "User:$botname/test/" : "$sitename:"); my $statpage = $pageprefix . "MIME type statistics"; my $listpage = "$statpage/Unusual types"; my $interval = ($debug ? "test" : "weekly"); my $list_max = 500; my @now = gmtime; my $date = strftime "%Y-%m-%d", @now; my $time = strftime "%H:%M:%S", @now; my ($t0, $t1, $dt, $sql, $rows); # common vars for queries and timeouts my $botsummary = "bot updating $interval statistics at $date $time"; my $testcond = ($debug ? strftime("img_timestamp >= '%Y%m%d%H%M%S'", gmtime(time-7*24*60*60)) : "1=1"); # Start script: warn "\n--- Generating $interval MIME statistics for $dbname at $date $time (UTC) ---\n"; # Knowing the system we're running on should help debugging: system qw(uname -a); $| = 1; # SGE should start us in home dir anyway, but better safe than sorry... chdir or die "Error changing to home directory: $!\n"; # Connect to database: my $data_source = "DBI:mysql:database=${dbname}_p;host=${dbname}-p.rrdb.toolserver.org;mysql_read_default_group=client"; my $dbh; $t0 = time; until ($dbh = DBI->connect($data_source)) { warn "SQL connect failed: $DBI::errstr\n"; my $sleep = time - $t0 + 1; die "Giving up after $sleep seconds.\n" if $sleep > TIMEOUT; warn "Waiting $sleep seconds before retry...\n"; sleep $sleep; } $dbh->{RaiseError} = 1; # all DB errors should be fatal $dbh->do("SET SESSION TRANSACTION ISOLATION LEVEL READ UNCOMMITTED"); # Set up LWP: my $ua = LWP::UserAgent->new( agent => "Mozilla/4.0 (compatible; $0)", from => 'vyznev@toolserver.org', cookie_jar => {}, parse_head => 0, ); # Generic MediaWiki API request handler: my $apiURI = "http://$server/w/api.php"; sub apireq { my $query = [format => 'xml', @_]; my $sleep = 5; if ($URI::VERSION < 1.36) { # Handling of Unicode strings changed in URI.pm v1.36, which $ua->post() calls internally utf8::encode($_) for @$query; } while (1) { my $res = $ua->post($apiURI, $query); my $err = $res->header('MediaWiki-API-Error') || ""; return XMLin( $res->content ) if $res->is_success and $err ne 'maxlag'; print STDERR "API request failed, ", ($err || $res->status_line), "..."; if ($sleep > 3*60*60) { warn "giving up\n"; return XMLin( $res->content ); } warn "sleeping $sleep seconds\n"; sleep $sleep; $sleep *= 2; } } # Subroutine to post data to the wiki: sub postpage { my ($title, $content, $summary) = @_; # XXX: inputs are assumed to be Unicode strings; use utf8::decode() on data from MySQL first! my $md5 = md5_hex(do { my $x = $content; utf8::encode($x); $x }); # md5_hex() wants octets warn "Getting edit token for [[$title]]\n"; my $data = apireq( action => 'query', prop => 'info', intoken => 'edit', titles => $title, ); my $token = $data->{query}{pages}{page}{edittoken} or die "Failed to get token, got:\n", Dumper($data), "\n"; # warn "pretending to save $title ($summary):\n$content\n"; return; ## DEBUG warn "Editing [[$title]] ($summary)\n"; my $edit = eval { apireq( maxlag => 5, action => 'edit', title => $title, summary => $summary, recreate => 1, md5 => $md5, text => $content, token => $token, ) }; warn $@ if $@; if (ref $edit ne 'HASH' or $edit->{error} or $edit->{edit}{result} ne 'Success') { if (ref $edit ne 'HASH') { warn "Got unexpected result:\n", Dumper($edit), "\n"; } elsif ($edit->{error}) { warn "Editing $title failed ($edit->{error}{code}): $edit->{error}{info}\n"; } elsif ($edit->{edit}{result} ne 'Success') { warn "Editing $title did not succeed ($edit->{edit}{result}):\n", Dumper($edit), "\n"; } else { warn "Qweebl zzyzx bleep blort?\n", Dumper($edit), "\n"; # should be impossible } require File::Temp; my $dump = File::Temp->new( UNLINK => 0, SUFFIX => "-$botname-dump-$date.txt", DIR => "public_html/temp" ); warn "Error detected, dumping content to ", $dump->filename, "\n"; binmode $dump, ":utf8" or warn "binmode failed: $!\n"; print $dump $content or warn "print failed: $!\n"; close $dump or warn "close failed: $!\n"; chmod 0644, $dump->filename or warn "chmod failed: $!\n"; warn "Dump complete.\n"; return 0; } warn "Page [[$title]] successfully saved.\n"; return 1; } # Run stat query: $t0 = time; warn "Starting stat query at ".gmtime($t0)."\n"; $sql = <<"END"; SELECT img_major_mime, img_minor_mime, img_media_type, COUNT(*) AS files, SUM(img_size) AS bytes FROM image WHERE $testcond GROUP BY img_major_mime, img_minor_mime, img_media_type /* SLOW_OK LIMIT:7200 */ END warn "Running query:\n$sql" if $debug; { $rows = eval { $dbh->selectall_arrayref($sql) }; if ($@) { warn "Stat query failed: $@\n"; my $sleep = time - $t0 + 1; die "Giving up after $sleep seconds.\n" if $sleep > TIMEOUT; warn "Waiting $sleep seconds before retry...\n"; sleep $sleep; redo; } } $t1 = time; $dt = sprintf "%.1f", $t1 - $t0; warn "Stat query done in $dt seconds at ".gmtime($t1)."\n"; # Dump stats to file: if (open my $dump, ">>", $dumpfile) { if ($debug) { warn "Debug mode, not actually writing ".@$rows." rows to $dumpfile\n"; } else { print $dump "\n$date\n"; for my $row (@$rows) { my ($major, $minor, $media, $files, $bytes) = @$row; print $dump "$major/$minor\t$media\t$files\t$bytes\n"; } warn @$rows." rows dumped to $dumpfile\n"; } close $dump or warn "ERROR writing to $dumpfile: $!\n"; } else { warn "ERROR opening $dumpfile for append: $!\n"; } # Generate stat page: my $stat_text = <<"END"; This page is updated $interval by [[User:$botname|]]. Any other edits made to this page will be lost on next update. '''Files on $sitename by [[w:MIME type|]] as of $date $time (UTC)''' See also: [[Commons:Project scope/Allowable file types]] {| class="wikitable sortable" ! MIME type !! Media type !! Files !! Bytes END my $total_f = 0; my $total_b = 0; my %rare_types; my $total_rare = 0; for my $row (@$rows) { my ($major, $minor, $media, $files, $bytes) = @$row; $total_f += $files; $total_b += $bytes; my $type = "$major/$minor || $media"; if ($files <= $list_max) { $type = "[[$listpage#$major/$minor ($media)|$major/$minor]] || $media"; $_ = $dbh->quote($_) for $major, $minor, $media; $rare_types{$media}{$major}{$minor}++; $total_rare += $files; } $_ = reverse($_), s/(\d{3})\B/$1,/g, $_ = reverse($_) for $files, $bytes; utf8::decode($type); # should not be necessary, but let's be consistent $stat_text .= <<"END"; |- | $type | align="right" | $files | align="right" | $bytes END } $_ = reverse($_), s/(\d{3})\B/$1,/g, $_ = reverse($_) for $total_f, $total_b; $stat_text .= <<"END"; |- class="sortbottom" ! style="border-right:0" | '''Total''' ! style="border-left:0" |   | align="right" | $total_f | align="right" | $total_b |} [[Category:Commons statistics|MIME type statistics]] END # Run list query: my @cond; for my $q_media (sort keys %rare_types) { my $major_types = $rare_types{$q_media}; for my $q_major (sort keys %$major_types) { my $minor_types = $major_types->{$q_major}; my $q_minor = join ", ", sort keys %$minor_types; push @cond, "(img_major_mime = $q_major AND img_media_type = $q_media AND img_minor_mime IN ($q_minor))"; } } my $cond = join "\n OR ", @cond; $t0 = time; warn "Starting list query at ".gmtime($t0).", expecting $total_rare rows\n"; $sql = <<"END"; SELECT img_major_mime, img_minor_mime, img_media_type, img_name FROM image WHERE ($cond) AND ($testcond) /* SLOW_OK LIMIT:7200 */ END warn "Running query:\n$sql" if $debug; { $rows = eval { $dbh->selectall_arrayref($sql) }; if ($@) { warn "List query failed: $@\n"; my $sleep = time - $t0 + 1; die "Giving up after $sleep seconds.\n" if $sleep > TIMEOUT; warn "Waiting $sleep seconds before retry...\n"; sleep $sleep; redo; } } $t1 = time; $dt = sprintf "%.1f", $t1 - $t0; warn "List query done in $dt seconds at ".gmtime($t1)."\n"; my %list; for my $row (@$rows) { my ($major, $minor, $media, $title) = @$row; $title =~ tr/_/ /; push @{ $list{"$major/$minor ($media)"} }, $title; } # Generate list page: my $list_text = <<"END"; This page is updated $interval by [[User:$botname|]]. Any other edits made to this page will be lost on next update. This page lists all files on $sitename for [[w:MIME type|]]s that have less than $list_max files each, as of $date $time (UTC). For a list of currently permitted types, see [[Commons:Project scope/Allowable file types]]. END for my $type (sort keys %list) { my $heading = $type; utf8::decode($heading); # should not be necessary, but let's be consistent $list_text .= "\n== $heading ==\n\n"; for my $title (sort @{ $list{$type} }) { utf8::decode($title); $title =~ s/([&<>\[\]{}|\x27])/sprintf "&#%d;", ord $1/eg; # \x27 = single quote $title =~ s/\xA0/ /g; $list_text .= "* [[:File:$title|$title]]\n"; } } $list_text .= "\n\n"; # Disconnect from database: $dbh->disconnect; # Log in to wiki: open PW, "<", $pwfile or die "Error opening $pwfile: $!"; my $botpass = ; chomp $botpass; close PW or warn "Error reading $pwfile: $!\n"; warn "Logging in to $server as $botname, will post to [[$statpage]]\n"; my $login = apireq( action => 'login', lgname => $botname, lgpassword => $botpass ); $login = apireq( action => 'login', lgname => $botname, lgpassword => $botpass, lgtoken => $login->{login}{token} ) if ($login->{login}{result} || '') eq 'NeedToken'; $login->{error} and die "Login as $botname failed ($login->{error}{code}): $login->{error}{info}\n"; $login->{login}{result} eq 'Success' or die "Login as $botname failed: $login->{login}{result}\n"; # Post pages to wiki: postpage($statpage, $stat_text, $botsummary); postpage($listpage, $list_text, $botsummary); warn "All done, exiting.\n"; __END__