#!/usr/bin/perl -w use strict; use Time::HiRes 'time'; use POSIX 'strftime'; use DBI; use LWP::UserAgent; use XML::Simple; use Digest::MD5 'md5_hex'; use Data::Dumper 'Dumper'; use constant DEBUG => ($0 =~ /_test\b/); # Configuration and general setup: # TODO: read these from the env or command line my $dbname = "commonswiki"; my $server = "commons.wikimedia.org"; my $botname = "MIMEStatBot"; my $pwfile = "/home/vyznev/mimestatbot.pass"; my $sitename = "Commons"; my $pageprefix = (DEBUG ? "User:$botname/test/" : "$sitename:"); my $statpage = $pageprefix . "MIME type statistics"; my $listpage = "$statpage/Unusual types"; my $interval = (DEBUG ? "test" : "weekly"); my $list_max = 500; my @now = gmtime; my $date = strftime "%Y-%m-%d", @now; my $time = strftime "%H:%M:%S", @now; my $botsummary = "bot updating $interval statistics at $date $time"; my $testcond = (DEBUG ? strftime("img_timestamp >= '%Y%m%d%H%M%S'", gmtime(time-7*24*60*60)) : "1=1"); # Connect to database: warn "\n--- Generating $interval MIME statistics for $dbname at $date $time (UTC) ---\n"; my $data_source = "DBI:mysql:database=${dbname}_p;host=${dbname}-p.db.toolserver.org;mysql_read_default_group=client"; my $dbh = DBI->connect($data_source) or die "SQL connect failed: $DBI::errstr\n"; defined $dbh->do("SET SESSION TRANSACTION ISOLATION LEVEL READ UNCOMMITTED") or die $dbh->errstr; # Connect to wiki: my $ua = LWP::UserAgent->new( agent => "Mozilla/4.0 (compatible; $0)", from => 'vyznev@toolserver.org', cookie_jar => {}, parse_head => 0, ); my $apiURI = "http://$server/w/api.php"; sub apireq { my $query = [format => 'xml', @_]; my $sleep = 5; while (1) { my $res = $ua->post($apiURI, $query); my $err = $res->header('MediaWiki-API-Error') || ""; return XMLin( $res->content ) if $res->is_success and $err ne 'maxlag'; print STDERR "API request failed, ", ($err || $res->status_line), "..."; if ($sleep > 3*60*60) { warn "giving up\n"; return XMLin( $res->content ); } warn "sleeping $sleep seconds\n"; sleep $sleep; $sleep *= 2; } } open PW, "<", $pwfile or die "Error opening $pwfile: $!"; my $botpass = ; chomp $botpass; close PW or warn "Error reading $pwfile: $!\n"; warn "Logging in to $server as $botname, will post to [[$statpage]]\n"; my $login = apireq( action => 'login', lgname => $botname, lgpassword => $botpass ); $login->{error} and die "Login as $botname failed ($login->{error}{code}): $login->{error}{info}\n"; $login->{login}{result} eq 'Success' or die "Login as $botname failed: $login->{login}{result}\n"; # Subroutine to post data to the wiki: sub postpage { my ($title, $content, $summary) = @_; # XXX: inputs are assumed to be UTF-8 encoded octets; use utf8::encode() on Unicode text first my $md5 = md5_hex($content); warn "Getting edit token for [[$title]]\n"; my $data = apireq( action => 'query', prop => 'info', intoken => 'edit', titles => $title, ); my $token = $data->{query}{pages}{page}{edittoken} or die "Failed to get token, got:\n", Dumper($data), "\n"; # warn "pretending to save $title ($summary):\n$content\n"; return; ## DEBUG warn "Editing [[$title]] ($summary)\n"; my $edit = eval { apireq( maxlag => 5, action => 'edit', title => $title, summary => $summary, recreate => 1, md5 => $md5, text => $content, token => $token, ) }; warn $@ if $@; if (ref $edit ne 'HASH' or $edit->{error} or $edit->{edit}{result} ne 'Success') { require File::Temp; my $dump = File::Temp->new( UNLINK => 0, SUFFIX => "-$botname-dump-$date.txt", DIR => $ENV{HOME} ); warn "Error detected, dumping content to ", $dump->filename, "\n"; print $dump $content; close $dump or warn "error: $!\n"; die "Dump complete\n"; } $edit->{error} and die "Editing $title failed ($edit->{error}{code}): $edit->{error}{info}\n"; $edit->{edit}{result} eq 'Success' or die "Editing $title failed ($edit->{edit}{result}):\n", Dumper($edit), "\n"; warn "Page [[$title]] successfully saved.\n"; } # Run stat query: my $t0 = time; warn "Starting stat query at ".gmtime($t0)."\n"; my $sql = <<"END"; SELECT img_major_mime, img_minor_mime, img_media_type, COUNT(*) AS files, SUM(img_size) AS bytes FROM image WHERE $testcond GROUP BY img_major_mime, img_minor_mime, img_media_type /* SLOW_OK */ END warn "Running query:\n$sql" if DEBUG; my $rows = $dbh->selectall_arrayref($sql) or die $dbh->errstr; my $t1 = time; my $dt = sprintf "%.1f", $t1 - $t0; warn "Stat query done in $dt seconds at ".gmtime($t1)."\n"; # Generate stat page: my $stat_text = <<"END"; This page is updated weekly by [[User:MIMEStatBot|MIMEStatBot]]. Any other edits made to this page will be lost on next update. '''Files on $sitename by [[w:MIME type|]] as of $date $time (UTC)''' See also: [[Commons:Project scope/Allowable file types]] {| class="wikitable sortable" ! MIME type !! Media type !! Files !! Bytes END my $total_f = 0; my $total_b = 0; my %rare_types; my $total_rare = 0; for my $row (@$rows) { my ($major, $minor, $media, $files, $bytes) = @$row; $total_f += $files; $total_b += $bytes; my $type = "$major/$minor || $media"; if ($files <= $list_max) { $type = "[[$listpage#$major/$minor ($media)|$major/$minor]] || $media"; $_ = $dbh->quote($_) for $major, $minor, $media; $rare_types{$media}{$major}{$minor}++; $total_rare += $files; } $_ = reverse($_), s/(\d{3})\B/$1,/g, $_ = reverse($_) for $files, $bytes; $stat_text .= <<"END"; |- | $type | align="right" | $files | align="right" | $bytes END } $_ = reverse($_), s/(\d{3})\B/$1,/g, $_ = reverse($_) for $total_f, $total_b; $stat_text .= <<"END"; |- class="sortbottom" ! colspan="2" | '''Total''' | align="right" | $total_f | align="right" | $total_b |} END postpage($statpage, $stat_text, $botsummary); # Run list query: my @cond; for my $q_media (sort keys %rare_types) { my $major_types = $rare_types{$q_media}; for my $q_major (sort keys %$major_types) { my $minor_types = $major_types->{$q_major}; my $q_minor = join ", ", sort keys %$minor_types; push @cond, "(img_major_mime = $q_major AND img_media_type = $q_media AND img_minor_mime IN ($q_minor))"; } } my $cond = join "\n OR ", @cond; $t0 = time; warn "Starting list query at ".gmtime($t0).", expecting $total_rare rows\n"; $sql = <<"END"; SELECT img_major_mime, img_minor_mime, img_media_type, img_name FROM image WHERE ($cond) AND ($testcond) /* SLOW_OK */ END warn "Running query:\n$sql" if DEBUG; $rows = $dbh->selectall_arrayref($sql) or die $dbh->errstr; $t1 = time; $dt = sprintf "%.1f", $t1 - $t0; warn "List query done in $dt seconds at ".gmtime($t1)."\n"; my %list; for my $row (@$rows) { my ($major, $minor, $media, $title) = @$row; $title =~ tr/_/ /; push @{ $list{"$major/$minor ($media)"} }, $title; } # Generate list page: my $list_text = <<"END"; This page is updated $interval by [[User:$botname|]]. Any other edits made to this page will be lost on next update. This page lists all files on $sitename for [[w:MIME type|]]s that have less than $list_max files each, as of $date $time (UTC). For a list of currently permitted types, see [[Commons:Project scope/Allowable file types]]. END for my $type (sort keys %list) { $list_text .= "\n== $type ==\n\n"; for my $title (sort @{ $list{$type} }) { $title =~ s/([&<>\[\]{}|\x27])/sprintf "&#%d;", ord $1/eg; # \x27 = single quote $title =~ s/\xC2\xA0/ /g; # the titles are UTF-8 encoded, so nbsp (\xA0) becomes \xC2\xA0 $list_text .= "* [[:Image:$title|$title]]\n"; } } $list_text .= "\n\n"; postpage($listpage, $list_text, $botsummary); # Disconnect from database: $dbh->disconnect; warn "All done, exiting.\n"; __END__