#!/usr/bin/perl # License: [[Public domain]] # remove this if you download the code to another server sub STOP_WORKING {0} # regression test cases: # Markadet fr.wikipedia.org (11k edits) # Kolja21 de.wikipedia.org (5.1k edits) # OldakQuill en.wikipedia.org (12k edits) # Mxn meta.wikimedia.org (1.7k edits) # Helios89 it.wikipedia.org (7k edits) # TODO: # - regarding the "403 access denied" problem, contact noc@wikimedia.org or #wikimedia-tech on freenode # - ahh, they actively block screen-scrapers # - sweet-talk Hashar or Dom into unblocking, temporarily disable the tool or enable some form of rate limiting, etc. # - add a starting-cutoff-date, so renominations for RfA could only include the most recent items # - add a # edits per day # - use something like this to retrieve the list of namespaces in real-time: # http://en.wikiquote.org/wiki/Special:Export/Main_Page # - make "minor" actually work well for editcountitis: # - eg. for each namespace, present it like: Category Talk: 23 (13) # where "23" is the non-minor edits, and "13" is the minor edits # - get it to work with other mediawikis (example: http://brandt-watch.org/bwwiki/Main_Page) # - include a date at the end of the top-15 breakdown # - change the
s to s on graph bars # - don't count comments as having an edit summary when it's purely an autocomment # - fix the issue where there's an "extra" first result when $offset > 0 # # - REWRITE IN AJAX so we don't have to worry about it being a temporary solution or not # - fix the sorting order on the output # # - ?? http://tools.wikimedia.de/~avar/cgi-bin/count # Possible other analysis graphs: # - monthly breakdowns # : have all the monthly breakdowns appear in one space on the page, but allow the user to # select between them with Javascript # - monthly breakdown of major/minor edits (like current red/green... make major edits on left, with minor edits trailing on right) # - monthly breakdown of the number of edits with summaries of /^(rv|revert)/ # - monthly breakdown, one each for the separate namespaces # - on monthly breakdowns, extrapolate the current month forward # - allow the user to hit ''(more)'' at the bottom of the namespace breakdowns, allowing them to # see a more complete list of top-15 # - allow the user to restrict the metrics to some specific recent period... eg. this is # something that's sometimes discussed on RfA # - any content-based analyses? (I suppose one would have to know which SQL thingies are quicker than others) # semi-far-out: # - allow the user to see JUST their edits from a specific page, when they click on that page on # the top-15 breakdown (furthermore, if structured right, it might let anybody's tool basically to # pop up the results of a $user && $page query) # - allow the results to be the combination of multiple users (either logged-in-user + anon-IP, # and multiple logged-in-users from multiple sites, eg. meta) use strict; use warnings; use CGI; use CGI::Carp qw(fatalsToBrowser); use Date::Parse; use LWP::Simple; use HTML::Entities; use Data::Dumper; sub LOGFILE {"/home/interiot/public_html/tmp/wannabe_kate.log"} if ($ENV{QUERY_STRING} eq "code") { # send ourself when requested open FIN, $0 and print "Content-type: text/plain\n\n", ; exit; } # fill out using these documents: # http://meta.wikimedia.org/wiki/MediaWiki_localisation#Getting_latest_file # http://sourceforge.net/docs/E04/ sub nmspc { my @a = map {s/#.*//; s/^\s+|\s+$//g; $_} grep /\S/, split /[\n\r]+/, shift; return { "\x00order" => [@a], map { $_,1} @a}; } my %valid_namespaces = ( 'en.wikipedia.org' => nmspc(qq[ Talk: Category talk: Category: Help: Help talk: Image: Image talk: MediaWiki: MediaWiki talk: Portal: Portal talk: Template: Template talk: User: User talk: Wikipedia: Wikipedia talk: ]), 'de.wikipedia.org' => nmspc(qq[ Diskussion: # Talk Kategorie: # Category: Kategorie Diskussion: # Category Talk: Hilfe: # Help: Hilfe Diskussion: # Help Talk: Bild: # Image: Bild Diskussion: # Image Talk: MediaWiki: # MediaWiki: MediaWiki Diskussion: # MediaWiki Talk: Portal: # Portal: Portal Diskussion: # Portal Talk: Vorlage: # Template: Vorlage Diskussion: # Template Talk: Benutzer: # User: Benutzer Diskussion: # User Talk: Wikipedia: # Wikipedia: Wikipedia Diskussion: # Wikipedia Talk: ]), 'it.wikipedia.org' => nmspc(qq[ Discussione # Talk: Categoria # Category: Discussioni categoria # Category Talk: Aiuto # Help: Discussioni aiuto # Help Talk: Immagine # Image: Discussioni immagine # Image Talk: MediaWiki # MediaWiki: Discussioni MediaWiki # MediaWiki Talk: Template # Template: Discussioni template # Template Talk: Utente # User: Discussioni utente # User Talk: Wikipedia # Wikipedia: Discussioni Wikipedia # Wikipedia Talk: ]), ); my $query = new CGI; my $site = CGI::Util::escape($query->param("site")); #$site = "" unless (lc($site) eq "en.wikipedia.org"); #my $username = CGI::Util::escape($query->param("username") || ""); my $username = $query->param("username") || ""; $username =~ s/[\+\s]/_/g; my $isvalid = 0; my $opted_in = 0; $opted_in = 1 if (lc($site) eq "en.wikipedia.org"); # not clear that they're opted in, but German wikipedia has let me make this assumption my $this_namespace; if ($ENV{QUERY_STRING}) { $isvalid = 1; $isvalid = 0 unless ($site =~ /^[\w\.]*\.(org|com|net)$/i); #$isvalid = 0 unless ($username =~ /^[-\w\._]*$/); $isvalid = 0 if (length($username) == 0); #$isvalid = 0 if ($username =~ /[<>\[\&]/); $isvalid = 0 if ($username =~ /[<>\[]/); } my $print_username = CGI::escapeHTML($username); $isvalid = 0 if (STOP_WORKING); # data we generate by parsing the output from Wikipedia my @urls; my $bandwidth_down = 0; my %namespace_totals; my $xml_lang = ""; my $earliest_perldate; my $latest_perldate; my %month_totals; my %month_editsummary_totals; my %unique_articles; my %namespace_unique_articles; my %article_titles; print "Content-type: text/html; charset=utf-8\n\n"; #cgi_dumper(\%valid_namespaces); if (!$isvalid) { if ($ENV{QUERY_STRING}) { if (STOP_WORKING) { print "Script not functional here. If you do want to run it, please download the code and run on your own private server, but note that scraping, especially on scripts that are publically available, may get your IP banned. See #wikimedia-tech.



\n"; } else { print "Invalid value. email Interiot if this is incorrect.



\n"; } } print <<"EOF"; This is a slow substitute for Kate's Tool when it's unavailable.

username
site
Notes: For bug reports/comments, see User talk:Interiot or email him. EOF } else { $this_namespace = $valid_namespaces{lc $site}; #cgi_dumper(\$this_namespace); exit; $username =~ s/^_+|_$//g; #print "$site
$username\n"; $namespace_totals{earliest} = get_5000($site, $username, 0); #cgi_dumper(\@urls, \%namespace_totals); exit; #cgi_dumper(\%unique_articles); $namespace_totals{"number of unique pages"} = scalar(keys %unique_articles); $namespace_totals{"avg edits per page"} = sprintf("%5.2f", $namespace_totals{total} / $namespace_totals{"number of unique pages"}) if ($namespace_totals{"number of unique pages"}); print $xml_lang, <<'EOF'; EOF print "\n"; print "

User:$print_username

\n"; print "\n"; print "\n"; foreach my $key (sort keys %namespace_totals) { print "
", $key, "", $namespace_totals{$key}, "\n"; } print "
\n"; #### output the months stats #cgi_dumper(\%month_editsummary_totals); my @months = list_months(); #cgi_dumper([scalar(gmtime($latest_perldate)), @months]); my $max_width = 0; $max_width = ($_ > $max_width ? $_ : $max_width) foreach (values %month_totals); if ($max_width > 0 && $opted_in) { print "\n"; foreach my $month (@months) { $month_editsummary_totals{$month} ||= 0; # fix STDERR $month_totals{$month} ||= 0; my $no_summary = $month_totals{$month} - $month_editsummary_totals{$month}; print "
$month ", $month_totals{$month}, "\n"; #print "
\n"; print "
\n"; print "
\n"; } print "
\n"; } print "
(green denotes edits with an edit summary (even an automatic one), red denotes edits without an edit summary)
\n"; #### output the top-15 namespace stats my $num_to_present = 15; if ($this_namespace && $opted_in) { # only do it if we're sure about the namespaces print "


\n"; #print "

\n"; foreach my $nmspc ("Mainspace", @{$this_namespace->{"\x00order"}}) { next unless (scalar(keys %{$namespace_unique_articles{$nmspc}})); my @articles = sort {$namespace_unique_articles{$nmspc}{$b} <=> $namespace_unique_articles{$nmspc}{$a}} grep { $namespace_unique_articles{$nmspc}{$_} > 1} # filter out items with only 1 edit keys(%{$namespace_unique_articles{$nmspc}}); next unless (@articles); #print "
\n"; print "
$nmspc\n"; my @present = splice(@articles, 0, $num_to_present); foreach my $article (@present) { my $artname = $article_titles{$article}; if ($nmspc ne 'Mainspace') { $artname =~ s/^.*?://; } $artname =~ s/\s/ /g; my $url = "http://$site/w/index.php?title=$article&action=history"; print "
", $namespace_unique_articles{$nmspc}{$article}, "$artname\n"; } # fill it out so float:left doesn't jumble up for (my $ctr=@present; $ctr<15; $ctr++) { print "
  \n"; } print "
\n"; #print "
\n"; } } #### output the bottom summary print "



If there were any problems, please email Interiot or post at User talk:Interiot.\n"; #print "

Based on these URLs:\n

\n"; #### log the bandwidth used open FOUT, ">>" . LOGFILE() or die; printf FOUT "%s %-20s %-30s %5dK %7d\n", scalar(localtime), $username, $site, int($bandwidth_down / 1024), $namespace_totals{total}; close FOUT; } # fetches one page of Special:Contributions, with 5000 entries on it, sub get_5000 { my $site = shift; my $username = shift; my $offset = shift; my $skip_oldid = shift || -1; my $earliest = ""; # for debugging purposes only my $limit = int($query->param("limit") || 5000); $limit = 50 if ($limit < 50); my $url = "http://$site/w/index.php?title=Special:Contributions&target=" . CGI::Util::escape($username) . "&offset=${offset}&limit=$limit&uselang=en"; $url =~ s/&offset=0&/&/; # if the offset is 0, just remove it, because Special:Contributions just doesn't like this anymore if (! $LWP::Simple::ua) { LWP::Simple::_init_ua(); $LWP::Simple::ua->agent("Interiot's Tool1; [[Special:Emailuser/Interiot]]"); } push(@urls, $url); if (@urls >= 10) { print "Too many pages fetched. Terminating.
\n"; #cgi_dumper(\@urls); exit; return $earliest; } my $page; if (1) { my $request = HTTP::Request->new(GET => $url); my $response = $LWP::Simple::ua->request($request); if (!$response->is_success) { print "While trying to fetch $url, $site responded:

\n", $response->status_line, "

", $response->content; exit; } $page = $response->content; $bandwidth_down += length($page); if (0) { local *FOUTOUT; open FOUTOUT, ">/var/tmp/kate/tmp.out" or die; print FOUTOUT $page; close FOUTOUT; } } else { open FININ, "; close FININ; } if ($page =~ m#(]+>)#i) { $xml_lang = $1; } ## parse each individual contribution #while ($page =~ /^
  • (\d\d:\d\d,.*)/igm) { my $last_oldid; while ($page =~ /^
  • ([^(]+\([^<]*\s*\(#is); $edit_summary++ if (//si); my $article_url; if (m#([^<]+)#si) { $article_url = $1; $article_titles{$1} = $2; } $unique_articles{$article_url}++; ## strip out all the HTML tags s/<[^>]*>//gs; if (/^(.*?) \(/) { my $date = $1; $earliest = $date; # translate months into english, so Date::Parse chn handle them # languages believed to work here: EN, DE, IT $date =~ s/\b(?:gen )\b/jan/gix; $date =~ s/\b(?:mär )\b/mar/gix; $date =~ s/\b(?:mai|mag )\b/may/gix; $date =~ s/\b(?:giu )\b/jun/gix; $date =~ s/\b(?:lug )\b/jul/gix; $date =~ s/\b(?:ago )\b/aug/gix; $date =~ s/\b(?:set )\b/sep/gix; $date =~ s/\b(?:okt|ott )\b/oct/gix; $date =~ s/\b(?:dez|dic )\b/dec/gix; $this_time = str2time($date); if ($this_time == 0) { #print "XXXXXXXXXXXXXXXXXXXXXXXXX
    \n"; } else { #print scalar(gmtime($this_time)), "
    \n"; $earliest_perldate = $this_time; # record the earliest and latest month we see $latest_perldate ||= $this_time; my $monthkey = monthkey(localtime($this_time)); $month_totals{$monthkey}++; $edit_summary && $month_editsummary_totals{$monthkey}++; } } s/^[^()]*\([^()]*\) \([^()]*\) (?:\S )? //; my $subspace = "Mainspace"; if (/^([^\s\d\/:]+(?:\s[^\s\d\/:]+)?:)/) { if (!$this_namespace || exists $this_namespace->{$1}) { $subspace = $1; } } $namespace_totals{$subspace}++; $namespace_totals{total}++; $namespace_unique_articles{$subspace}{$article_url}++; #print "$_
    \n"; } ## if they have more than 5000 contributions, go to the next page #while ($page =~ m#href="[^"]+:Contributions[^"]+offset=(\d+)#ig) { while ($page =~ m#href="[^"]+offset=(\d+)[^"]*"[^>]*>Older #ig) { #print "Trying again at offset $1
    \n"; next unless ($1 > 0 && ($offset == 0 || $1 < $offset)); return get_5000($site, $username, $1, $last_oldid); # tail recursion until there are no more } return $earliest; } # returns something like [ # "2003/10", # "2003/11", # "2003,12" # ] sub list_months { my $last_monthkey = ''; my @ret; # yes, this is a fairly odd algorithm. oh well. for (my $date=$earliest_perldate; $date<=$latest_perldate; $date+=10*24*60*60) { my $monthkey = monthkey(localtime($date)); if ($monthkey ne $last_monthkey) { push(@ret, $monthkey); $last_monthkey = $monthkey; } } my $monthkey = monthkey(localtime($latest_perldate)); if ($monthkey ne $last_monthkey) { push(@ret, $monthkey); $last_monthkey = $monthkey; } return @ret; } sub monthkey {($_[5] + 1900) . "/" . ($_[4] + 1)} sub cgi_dumper {print "
    ", HTML::Entities::encode(Dumper(@_)), "
    "}