#!/usr/bin/perl -w
#
# Scan a MediaWiki XML dump read from STDIN and print, one per line on STDOUT,
# the title of every page whose text contains BOTH
#   * a {{ns:...}} parser-function call, and
#   * one of the "E"-suffixed page/namespace name magic words, e.g.
#     {{FULLPAGENAMEE}}, {{TALKPAGENAMEE}} or {{NAMESPACEE}}.
# A single-line progress indicator is refreshed on STDERR about once a second.

use strict;
use warnings;
use XML::Parser;

$| = 1;    # autoflush STDOUT so titles show up as soon as they are found

# The two wikitext constructs being searched for.
my $NS_CALL_RE      = qr/\{\{\s*ns:/;
my $ENCODED_NAME_RE = qr/\{\{\s*(?:FULL|TALK|SUBJECT|ARTICLE|NAME)(?:PAGENAME|SPACE)E\s*\}\}/;

my ($title, @text);    # title of the current page; chunks of the current <text>
my $saw_ns_call      = 0;    # current page contained a match for $NS_CALL_RE
my $saw_encoded_name = 0;    # current page contained a match for $ENCODED_NAME_RE
my $n   = 0;                 # pages processed so far
my $t   = my $t0 = time;     # time of last progress update / start time
my $len = 0;                 # length of the progress line currently on STDERR

# Start-tag handler: resets per-page state on <page> and routes character
# data to the matching collector while inside <title> or <text>.
sub handle_start {
    my ($expat, $tag) = @_;

    if ($tag eq 'page') {
        $title = "";
        @text  = ();
        ($saw_ns_call, $saw_encoded_name) = (0, 0);
    }
    elsif ($tag eq 'title') {
        $expat->setHandlers(Char => \&handle_char_title);
    }
    elsif ($tag eq 'text') {
        @text = ();
        $expat->setHandlers(Char => \&handle_char_text);
    }
    return;
}

# Character handler used inside <title>: appends the chunk to $title.
sub handle_char_title {
    $title .= $_[1];
    return;
}

# Character handler used inside <text>: stores the chunk for later.
# The parser may deliver one text node in several calls, so the chunks are
# only matched after being joined at </text>; testing each chunk on its own
# misses any construct that straddles a chunk boundary.
sub handle_char_text {
    push @text, $_[1];
    return;
}

# End-tag handler: evaluates a finished <text>, and on </page> updates the
# progress line and prints the title if both constructs were seen.
sub handle_end {
    my ($expat, $tag) = @_;

    if ($tag eq 'page') {
        $n++;

        my $now = time;    # sampled once so all figures below agree
        if ($now >= $t + 1) {
            my $elapsed = $now - $t0;
            $elapsed = 1 if $elapsed < 1;    # never divide by zero
            my $bytes = $expat->current_byte;
            my $msg = sprintf "Processed %d pages (%.2f/s) = %d bytes (%.2f/s) in %d sec...",
                $n, $n / $elapsed, $bytes, $bytes / $elapsed, $elapsed;
            $len = length $msg;
            print STDERR "\r", $msg;
            $t = $now;
        }

        if ($saw_ns_call and $saw_encoded_name) {
            # Blank out the progress line so it does not get interleaved
            # with the title when both streams go to the same terminal.
            print STDERR "\r", " " x $len, "\r" if $len;
            $len = 0;
            print $title, "\n";
        }
    }
    elsif ($tag eq 'text') {
        $expat->setHandlers(Char => undef);

        my $text = join "", @text;
        @text = ();    # keep memory bounded by a single <text> element

        # Debugging aids, disabled:
        # warn "\r[[$title]]: \"", substr($text, 0, 40), "...\"\n"; sleep 1;
        # warn "[[$title]] is a redirect!", " " x 40, "\n" if $text =~ /^\#REDIRECT\b/i;

        # Flags are sticky for the page: a page may hold several <text>
        # elements and the two constructs need not be in the same one.
        $saw_ns_call      ||= $text =~ $NS_CALL_RE;
        $saw_encoded_name ||= $text =~ $ENCODED_NAME_RE;
    }
    elsif ($tag eq 'title') {
        $expat->setHandlers(Char => undef);
    }
    return;
}

my $parser = XML::Parser->new(
    ProtocolEncoding => 'UTF-8',
    Handlers         => { Start => \&handle_start, End => \&handle_end },
);

# Give the parser raw bytes: it is told the encoding via ProtocolEncoding and
# does the decoding itself.  A :utf8 layer on STDIN would only decode the
# input a second time in Perl and make reads count characters, not bytes.
binmode STDIN;
binmode STDOUT, ':utf8';
binmode STDERR, ':utf8';

$parser->parse(\*STDIN);
warn "\n";    # finish the progress line

__END__
Wikipedia http://en.wikipedia.org/wiki/Main_Page MediaWiki 1.14alpha first-letter Media Special Talk User User talk Wikipedia Wikipedia talk Image Image talk MediaWiki MediaWiki talk Template Template talk Help Help talk Category Category talk Portal Portal talk AmericanSamoa 6 133452270 2007-05-25T17:12:06Z Gurch 241822 Revert edit(s) by [[Special:Contributions/Ngaiklin|Ngaiklin]] to last version by [[Special:Contributions/Docu|Docu]] 
#REDIRECT [[American Samoa]]{{R from CamelCase}}