#!/usr/bin/perl
# License: [[Public domain]]
# remove this if you download the code to another server
sub STOP_WORKING {0}
# regression test cases:
# Markadet fr.wikipedia.org (11k edits)
# Kolja21 de.wikipedia.org (5.1k edits)
# OldakQuill en.wikipedia.org (12k edits)
# Mxn meta.wikimedia.org (1.7k edits)
# Helios89 it.wikipedia.org (7k edits)
# TODO:
# - regarding the "403 access denied" problem, contact noc@wikimedia.org or #wikimedia-tech on freenode
# - ahh, they actively block screen-scrapers
# - sweet-talk Hashar or Dom into unblocking, temporarily disable the tool or enable some form of rate limiting, etc.
# - add a starting-cutoff-date, so renominations for RfA could only include the most recent items
# - add a # edits per day
# - use something like this to retrieve the list of namespaces in real-time:
# http://en.wikiquote.org/wiki/Special:Export/Main_Page
# - make "minor" actually work well for editcountitis:
# - eg. for each namespace, present it like: Category Talk: 23 (13)
# where "23" is the non-minor edits, and "13" is the minor edits
# - get it to work with other mediawikis (example: http://brandt-watch.org/bwwiki/Main_Page)
# - include a date at the end of the top-15 breakdown
# - change the
s to
s on graph bars
# - don't count comments as having an edit summary when it's purely an autocomment
# - fix the issue where there's an "extra" first result when $offset > 0
#
# - REWRITE IN AJAX so we don't have to worry about it being a temporary solution or not
# - fix the sorting order on the output
#
# - ?? http://tools.wikimedia.de/~avar/cgi-bin/count
# Possible other analysis graphs:
# - monthly breakdowns
# : have all the monthly breakdowns appear in one space on the page, but allow the user to
# select between them with Javascript
# - monthly breakdown of major/minor edits (like current red/green... make major edits on left, with minor edits trailing on right)
# - monthly breakdown of the number of edits with summaries of /^(rv|revert)/
# - monthly breakdown, one each for the separate namespaces
# - on monthly breakdowns, extrapolate the current month forward
# - allow the user to hit ''(more)'' at the bottom of the namespace breakdowns, allowing them to
# see a more complete list of top-15
# - allow the user to restrict the metrics to some specific recent period... eg. this is
# something that's sometimes discussed on RfA
# - any content-based analyses? (I suppose one would have to know which SQL thingies are quicker than others)
# semi-far-out:
# - allow the user to see JUST their edits from a specific page, when they click on that page on
# the top-15 breakdown (furthermore, if structured right, it might let anybody's tool basically to
# pop up the results of a $user && $page query)
# - allow the results to be the combination of multiple users (either logged-in-user + anon-IP,
# and multiple logged-in-users from multiple sites, eg. meta)
use strict;
use warnings;
use CGI;
use CGI::Carp qw(fatalsToBrowser);
use Date::Parse;
use LWP::Simple;
use HTML::Entities;
use Data::Dumper;
sub LOGFILE {"/home/interiot/public_html/tmp/wannabe_kate.log"}
if ($ENV{QUERY_STRING} eq "code") { # send ourself when requested
open FIN, $0 and print "Content-type: text/plain\n\n", ;
exit;
}
# fill out using these documents:
# http://meta.wikimedia.org/wiki/MediaWiki_localisation#Getting_latest_file
# http://sourceforge.net/docs/E04/
sub nmspc {
my @a = map {s/#.*//; s/^\s+|\s+$//g; $_} grep /\S/, split /[\n\r]+/, shift;
return { "\x00order" => [@a], map { $_,1} @a};
}
my %valid_namespaces = (
'en.wikipedia.org' => nmspc(qq[
Talk:
Category talk:
Category:
Help:
Help talk:
Image:
Image talk:
MediaWiki:
MediaWiki talk:
Portal:
Portal talk:
Template:
Template talk:
User:
User talk:
Wikipedia:
Wikipedia talk:
]),
'de.wikipedia.org' => nmspc(qq[
Diskussion: # Talk
Kategorie: # Category:
Kategorie Diskussion: # Category Talk:
Hilfe: # Help:
Hilfe Diskussion: # Help Talk:
Bild: # Image:
Bild Diskussion: # Image Talk:
MediaWiki: # MediaWiki:
MediaWiki Diskussion: # MediaWiki Talk:
Portal: # Portal:
Portal Diskussion: # Portal Talk:
Vorlage: # Template:
Vorlage Diskussion: # Template Talk:
Benutzer: # User:
Benutzer Diskussion: # User Talk:
Wikipedia: # Wikipedia:
Wikipedia Diskussion: # Wikipedia Talk:
]),
'it.wikipedia.org' => nmspc(qq[
Discussione # Talk:
Categoria # Category:
Discussioni categoria # Category Talk:
Aiuto # Help:
Discussioni aiuto # Help Talk:
Immagine # Image:
Discussioni immagine # Image Talk:
MediaWiki # MediaWiki:
Discussioni MediaWiki # MediaWiki Talk:
Template # Template:
Discussioni template # Template Talk:
Utente # User:
Discussioni utente # User Talk:
Wikipedia # Wikipedia:
Discussioni Wikipedia # Wikipedia Talk:
]),
);
my $query = new CGI;
my $site = CGI::Util::escape($query->param("site"));
#$site = "" unless (lc($site) eq "en.wikipedia.org");
#my $username = CGI::Util::escape($query->param("username") || "");
my $username = $query->param("username") || "";
$username =~ s/[\+\s]/_/g;
my $isvalid = 0;
my $opted_in = 0;
$opted_in = 1 if (lc($site) eq "en.wikipedia.org"); # not clear that they're opted in, but German wikipedia has let me make this assumption
my $this_namespace;
if ($ENV{QUERY_STRING}) {
$isvalid = 1;
$isvalid = 0 unless ($site =~ /^[\w\.]*\.(org|com|net)$/i);
#$isvalid = 0 unless ($username =~ /^[-\w\._]*$/);
$isvalid = 0 if (length($username) == 0);
#$isvalid = 0 if ($username =~ /[<>\[\&]/);
$isvalid = 0 if ($username =~ /[<>\[]/);
}
my $print_username = CGI::escapeHTML($username);
$isvalid = 0 if (STOP_WORKING);
# data we generate by parsing the output from Wikipedia
my @urls;
my $bandwidth_down = 0;
my %namespace_totals;
my $xml_lang = "";
my $earliest_perldate;
my $latest_perldate;
my %month_totals;
my %month_editsummary_totals;
my %unique_articles;
my %namespace_unique_articles;
my %article_titles;
print "Content-type: text/html; charset=utf-8\n\n";
#cgi_dumper(\%valid_namespaces);
if (!$isvalid) {
if ($ENV{QUERY_STRING}) {
if (STOP_WORKING) {
print "Script not functional here. If you do want to run it, please download the code and run on your own private server, but note that scraping, especially on scripts that are publically available, may get your IP banned. See #wikimedia-tech.
\n";
} else {
print "Invalid value. email Interiot if this is incorrect.
\n";
}
}
print <<"EOF";
This is a slow substitute for Kate's Tool when it's unavailable.
Notes:
For bug reports/comments, see User talk:Interiot or email him.
EOF
} else {
$this_namespace = $valid_namespaces{lc $site};
#cgi_dumper(\$this_namespace); exit;
$username =~ s/^_+|_$//g;
#print "$site $username\n";
$namespace_totals{earliest} = get_5000($site, $username, 0);
#cgi_dumper(\@urls, \%namespace_totals); exit;
#cgi_dumper(\%unique_articles);
$namespace_totals{"number of unique pages"} = scalar(keys %unique_articles);
$namespace_totals{"avg edits per page"} = sprintf("%5.2f", $namespace_totals{total} / $namespace_totals{"number of unique pages"})
if ($namespace_totals{"number of unique pages"});
print $xml_lang, <<'EOF';
EOF
print "Go back to see caveats or to check another user. \n";
print "User:$print_username\n";
print "run at ", scalar(gmtime), " GMT \n";
print "\n";
foreach my $key (sort keys %namespace_totals) {
print "| ", $key, " | ", $namespace_totals{$key}, "\n";
}
print " | \n";
#### output the months stats
#cgi_dumper(\%month_editsummary_totals);
my @months = list_months();
#cgi_dumper([scalar(gmtime($latest_perldate)), @months]);
my $max_width = 0;
$max_width = ($_ > $max_width ? $_ : $max_width) foreach (values %month_totals);
if ($max_width > 0 && $opted_in) {
print "\n";
foreach my $month (@months) {
$month_editsummary_totals{$month} ||= 0; # fix STDERR
$month_totals{$month} ||= 0;
my $no_summary = $month_totals{$month} - $month_editsummary_totals{$month};
print "| $month | ", $month_totals{$month}, "\n";
#print " | \n";
print " | \n";
print "\n";
}
print " | \n";
}
print "(green denotes edits with an edit summary (even an automatic one), red denotes edits without an edit summary) \n";
#### output the top-15 namespace stats
my $num_to_present = 15;
if ($this_namespace && $opted_in) { # only do it if we're sure about the namespaces
print " \n";
#print "
NOTE: This section has a tendency to hilight a user's \"youthful indiscretions\". Please take the dates of the edits into account. \n";
foreach my $nmspc ("Mainspace", @{$this_namespace->{"\x00order"}}) {
next unless (scalar(keys %{$namespace_unique_articles{$nmspc}}));
my @articles = sort {$namespace_unique_articles{$nmspc}{$b} <=> $namespace_unique_articles{$nmspc}{$a}}
grep { $namespace_unique_articles{$nmspc}{$_} > 1} # filter out items with only 1 edit
keys(%{$namespace_unique_articles{$nmspc}});
next unless (@articles);
#print "\n";
print " | $nmspc\n";
my @present = splice(@articles, 0, $num_to_present);
foreach my $article (@present) {
my $artname = $article_titles{$article};
if ($nmspc ne 'Mainspace') {
$artname =~ s/^.*?://;
}
$artname =~ s/\s/ /g;
my $url = "http://$site/w/index.php?title=$article&action=history";
print " |
|---|
| ", $namespace_unique_articles{$nmspc}{$article}, " | $artname\n";
}
# fill it out so float:left doesn't jumble up
for (my $ctr=@present; $ctr<15; $ctr++) {
print " | | | \n";
}
print " |
\n";
#print " \n";
}
}
#### output the bottom summary
print "
If there were any problems, please email Interiot or post at User talk:Interiot.\n";
#print "
Based on these URLs:\n \n", join("\n", map {"- Based directly on these URLs:\n";
for (my $ctr=0; $ctr<@urls; $ctr++) {
print "[", ($ctr+1), "]";
print ", " unless ($ctr >= @urls - 1);
print "\n";
}
print "\n";
#### log the bandwidth used
open FOUT, ">>" . LOGFILE() or die;
printf FOUT "%s %-20s %-30s %5dK %7d\n", scalar(localtime), $username, $site,
int($bandwidth_down / 1024), $namespace_totals{total};
close FOUT;
}
# fetches one page of Special:Contributions, with 5000 entries on it,
sub get_5000 {
my $site = shift;
my $username = shift;
my $offset = shift;
my $skip_oldid = shift || -1;
my $earliest = "";
# for debugging purposes only
my $limit = int($query->param("limit") || 5000);
$limit = 50 if ($limit < 50);
my $url = "http://$site/w/index.php?title=Special:Contributions&target=" . CGI::Util::escape($username) . "&offset=${offset}&limit=$limit&uselang=en";
$url =~ s/&offset=0&/&/; # if the offset is 0, just remove it, because Special:Contributions just doesn't like this anymore
if (! $LWP::Simple::ua) {
LWP::Simple::_init_ua();
$LWP::Simple::ua->agent("Interiot's Tool1; [[Special:Emailuser/Interiot]]");
}
push(@urls, $url);
if (@urls >= 10) {
print "Too many pages fetched. Terminating.
\n";
#cgi_dumper(\@urls); exit;
return $earliest;
}
my $page;
if (1) {
my $request = HTTP::Request->new(GET => $url);
my $response = $LWP::Simple::ua->request($request);
if (!$response->is_success) {
print "While trying to fetch $url, $site responded:
\n", $response->status_line, "
", $response->content;
exit;
}
$page = $response->content;
$bandwidth_down += length($page);
if (0) {
local *FOUTOUT;
open FOUTOUT, ">/var/tmp/kate/tmp.out" or die;
print FOUTOUT $page;
close FOUTOUT;
}
} else {
open FININ, ";
close FININ;
}
if ($page =~ m#(]+>)#i) {
$xml_lang = $1;
}
## parse each individual contribution
#while ($page =~ /^ - (\d\d:\d\d,.*)/igm) {
my $last_oldid;
while ($page =~ /^
- ([^(]+\([^<]*\s*\(#is);
$edit_summary++ if (/
|