#!/usr/bin/perl -w
#
# Add and/or remove categories on a list of wiki pages through the MediaWiki
# web API.  Page titles come from the command line and/or from files given
# with -f; categories come from -a / -r.  Run without arguments for usage.

use strict;
use LWP::UserAgent;
use XML::Simple;
use Encode 'encode_utf8';
use Digest::MD5 'md5_hex';
use Data::Dumper 'Dumper';
use Getopt::Long 'GetOptions';
use Term::ReadKey 'ReadMode';
use Algorithm::Diff 'compact_diff';
use POSIX 'strftime';

# Default options:
my $username    = "Ilmari Karonen";
my $server      = "commons.wikimedia.org";
my $summary;
my $keep_uncat  = 0;
my $confirm     = 0;
my $nonexistent = 0;
my (@add, @rem);
my @namespaces;
my @files;

# Usage instructions:
my $usage = <<"USAGE";
Usage: $0 [options] <page> [<page> ...]

Options:
    -u, --user, --username
        User name to log in as (default: $username).
    -s, --server
        Hostname of wiki server (default: $server).
    -m, --summary
        Edit summary to use (default: list of changes made).
    -a, --add
        Add this category to all listed pages.
    -r, --remove
        Remove this category (if it's present) from all listed pages.
    -f, --file
        Read page names from the given file.
    -n, --namespace
        Localized name of the category namespace (default: Category).
    -c, --confirm
        Show a diff and ask for confirmation before each edit.
    -x, --nonexistent
        Don't ask for confirmation when adding nonexistent categories.
    --keep-uncategorized
        Don't remove {{uncategorized}} tags if present.

You need to specify at least one page and at least one category to add
or remove.  The "Category:" prefix is optional.  You may provide
multiple categories to add or remove by separating them with a pipe
character ("|"; don't forget to escape it from the shell) or by using
multiple -a or -r options.
USAGE

# Parse options, print usage message if failed:
GetOptions(
    'username|user|u=s'  => \$username,
    'server|s=s'         => \$server,
    'summary|m=s'        => \$summary,
    'add|a=s'            => \@add,
    'remove|r=s'         => \@rem,
    'files|f=s'          => \@files,
    'namespace|n=s'      => \@namespaces,
    'confirm|c'          => \$confirm,
    'nonexistent|x'      => \$nonexistent,
    'keep-uncategorized' => \$keep_uncat,
) and (@ARGV or @files) and (@add or @rem) or die $usage;

# Every non-blank line of each -f file is treated as one more page argument.
foreach my $file (@files) {
    open my $fh, "<", $file or die "Error opening $file: $!\n";
    push @ARGV, grep /\S/, <$fh>;
    close $fh or die "Error reading from $file: $!\n";
}
@ARGV or die "Empty input file(s)!\n";

# Return copies of the arguments with leading/trailing whitespace and
# underscores stripped and inner runs of them collapsed to a single space.
sub collapse_spaces {
    my @out = @_;
    s/^[\s_]+//, s/[\s_]+$//, s/[\s_]+/ /g for @out;
    return @out;
}

# Print $prompt on STDERR until the user answers yes or no on STDIN.
# Returns 1 for "y"/"yes" and 0 for "n"/"no" (case-insensitive); dies if
# STDIN reaches end of input, since re-asking could never succeed.
sub ask_yes_no {
    my ($prompt) = @_;
    while (1) {
        print STDERR $prompt;
        my $reply = <STDIN>;
        defined $reply or die "\nUnexpected end of input, aborting.\n";
        return 1 if $reply =~ /^y(es)?$/i;
        return 0 if $reply =~ /^no?$/i;
    }
}

# Namespace names: user-supplied ones first, "Category" always accepted.
@namespaces = collapse_spaces(map { split /\|/, $_ } @namespaces);
s/ *:$// for @namespaces;
push @namespaces, "Category";
my $catns = $namespaces[0];    # default namespace name to use

# Case-insensitive alternation of all namespace names; a space in a name
# matches any run of whitespace/underscores in the wikitext.
my $ns_re = join "|", map { quotemeta } @namespaces;
$ns_re =~ s/\\ /[\\s_]+/g;
$ns_re = qr/$ns_re/i;

# Pipe-separated values are split into individual categories / pages.
@add  = collapse_spaces(map { split /\|/, $_ } @add);
@rem  = collapse_spaces(map { split /\|/, $_ } @rem);
@ARGV = collapse_spaces(map { split /\|/, $_ } @ARGV);
s/^$ns_re *: *// for @add, @rem;    # the namespace prefix is optional

# One regex per category, matching its [[Category:Name|sortkey]] link
# together with the whitespace (and at most one newline) in front of it.
# The first letter of the name is matched case-insensitively.
my %cat_re;
$cat_re{$_} = quotemeta($_) for @add, @rem;
s/^(\pL)/(?i:$1)/, s/\\ /[\\s_]+/g for values %cat_re;
$_ = qr/[^\S\n]* (\n[^\S\n]*)? \[\[ [\s_]* $ns_re [\s_]* : [\s_]* $_ [\s_]* (?:\| [^\[\]]*)? \]\]/x
    for values %cat_re;

# Matches {{uncategorized}}-style maintenance templates.
my $uncat_re = qr/[^\S\n]* (\n[^\S\n]*)? \{\{ [\s_]* ([Uu]ncat(egori[sz]ed([\s_]+image)?)?|[Nn]ocat|[Nn]eedscategory) [^{}]* \}\}/x;

warn "Processing ".@ARGV." pages:\n", map " $_\n", @ARGV;
warn "Adding category \"$_\"\n" for @add;
warn "Removing category \"$_\"\n" for @rem;

# Set up user agent, define subroutine for API queries:
my $ua = LWP::UserAgent->new(
    agent      => "Mozilla/4.0 (compatible; $0)",
    from       => 'vyznev@toolserver.org',
    cookie_jar => {},
    parse_head => 0,
);

# NOTE(review): requests, including the login password, are posted over
# plain HTTP -- consider https if the server and the local LWP support it.
my $apiURI = "http://$server/w/api.php";

# POST an API request and return the XML response parsed by XMLin.
# Arguments are a flat key => value list of API parameters; "format=xml"
# is added automatically and every value is UTF-8 encoded.  Failed,
# "maxlag" and "ratelimited" responses are retried with a doubling delay;
# once the delay exceeds an hour the last response is parsed and returned.
sub apireq {
    my $query = [format => 'xml', map encode_utf8($_), @_];
    my $sleep = 1;
    while (1) {
        my $res = $ua->post($apiURI, $query);
        my $err = $res->header('MediaWiki-API-Error') || "";
        return XMLin( $res->content )
            if $res->is_success and $err ne 'maxlag' and $err ne 'ratelimited';
        print STDERR " API request failed, ", ($err || $res->status_line), "...";
        if ($sleep > 60*60) {
            warn "giving up\n";
            return XMLin( $res->content );
        }
        warn "sleeping $sleep seconds\n";
        sleep $sleep;
        $sleep *= 2;
    }
}

# Check for existence of categories to be added:

# Names (without namespace prefix) of the tracking categories that mark a
# category as a "category redirect"; one name per line.
my @redircats = split "\n", <<"END";
!Redireccionamentos de categorias
Cat\x{E9}gorie redirig\x{E9}e
Categorii de redirec\x{163}ionare
Category redirects
Kateg\x{F3}ria\x{E1}tir\x{E1}ny\x{ED}t\x{E1}sok
Kategori y\x{F6}nlendirmeleri
Kategori yang dialihkan
Omdirigeringskategorier
Pengalihan kategori Wikipedia
Przekierowania kategorii
Th\x{1EC3} lo\x{1EA1}i \x{111}\x{1ED5}i h\x{1B0}\x{1EDB}ng
Wikipedia category redirects
Wikipedia omdirigertekategorier
Wikipedia:Categor\x{ED}as redirigidas
Wikipedia:Kategori p\x{EB}rcjell\x{EB}se
Wikipedia:Kategorienweiterleitung
Wikipediako birzuzenketa kategoriak
Zastaral\x{E9} kategorie
\x{412}\x{438}\x{43A}\x{438}\x{43F}\x{435}\x{434}\x{438}\x{44F}:\x{41A}\x{430}\x{442}\x{435}\x{433}\x{43E}\x{440}\x{438}\x{438}-\x{434}\x{443}\x{431}\x{43B}\x{438}\x{43A}\x{430}\x{442}\x{44B}
\x{412}\x{438}\x{43A}\x{438}\x{43F}\x{435}\x{434}\x{438}\x{458}\x{430}:\x{41F}\x{440}\x{435}\x{43D}\x{430}\x{441}\x{43E}\x{447}\x{435}\x{43D}\x{438} \x{43A}\x{430}\x{442}\x{435}\x{433}\x{43E}\x{440}\x{438}\x{438}
\x{41F}\x{435}\x{440}\x{435}\x{43D}\x{430}\x{43F}\x{440}\x{430}\x{432}\x{43B}\x{435}\x{43D}\x{43D}\x{44F} \x{43A}\x{430}\x{442}\x{435}\x{433}\x{43E}\x{440}\x{456}\x{439} \x{412}\x{456}\x{43A}\x{456}\x{43F}\x{435}\x{434}\x{456}\x{457}
\x{5D5}\x{5D5}\x{5D9}\x{5E7}\x{5D9}\x{5E4}\x{5E2}\x{5D3}\x{5D9}\x{5E2} \x{5E7}\x{5D0}\x{5D8}\x{5E2}\x{5D2}\x{5D0}\x{5E8}\x{5D9}\x{5E2} \x{5D0}\x{5E8}\x{5D9}\x{5D1}\x{5E2}\x{5E8}\x{5E4}\x{5D9}\x{5E8}\x{5DF}
\x{5DF2}\x{91CD}\x{5B9A}\x{5411}\x{7684}\x{5206}\x{7C7B}
\x{62A}\x{62D}\x{648}\x{64A}\x{644}\x{627}\x{62A} \x{62A}\x{635}\x{646}\x{64A}\x{641}\x{627}\x{62A} \x{648}\x{64A}\x{643}\x{64A}\x{628}\x{64A}\x{62F}\x{64A}\x{627}
\x{62A}\x{62D}\x{648}\x{64A}\x{644}\x{627}\x{62A} \x{62A}\x{635}\x{646}\x{64A}\x{641}\x{627}\x{62A} \x{648}\x{64A}\x{643}\x{64A}\x{628}\x{64A}\x{62F}\x{64A}\x{627}
\x{631}\x{62F}\x{647}\x{200C}\x{647}\x{627}\x{6CC} \x{645}\x{646}\x{62A}\x{642}\x{644} \x{634}\x{62F}\x{647}
\x{7DAD}\x{57FA}\x{767E}\x{79D1}\x{985E}\x{8DF3}\x{8F49}
\x{935}\x{93F}\x{915}\x{93F}\x{92A}\x{940}\x{921}\x{93F}\x{92F}\x{93E} \x{936}\x{94D}\x{930}\x{947}\x{923}\x{940} \x{905}\x{928}\x{941}\x{92A}\x{94D}\x{930}\x{947}\x{937}\x{93F}\x{924}
\x{C704}\x{D0A4}\x{BC31}\x{ACFC} \x{BD84}\x{B958} \x{B118}\x{ACA8}\x{C8FC}\x{AE30}
END

if (@add) {
    # Ask for each target category's page info and whether it belongs to
    # any of the redirect-tracking categories listed above.
    my $cats = apireq(
        action       => "query",
        prop         => "info|categories",
        titles       => join("|", map "$catns:$_", @add),
        clcategories => join("|", map "Category:$_", @redircats),
    );
    # warn Dumper($cats);

    # XMLin yields a hash for a single <page> and an array for several.
    my $pages = $cats->{query}{pages}{page};
    $pages = [$pages] if ref($pages) ne 'ARRAY';

    my $badcats = 0;
    foreach my $page (@$pages) {
        my $title = $page->{title};
        if (exists $page->{invalid}) {
            warn " WARNING: [[$title]] is not a valid title!\n";
        } elsif (exists $page->{missing}) {
            warn " WARNING: [[$title]] does not exist!\n";
        } elsif (exists $page->{redirect}) {
            warn " WARNING: [[$title]] is a redirect!\n";
        } elsif ($page->{ns} != 14) {
            warn " WARNING: [[$title]] is not a category! (WTF?)\n";
        } elsif ($page->{categories}) {
            warn " WARNING: [[$title]] appears to be a category redirect!\n";
        } else {
            next;
        }
        $badcats++;
    }

    if ($badcats) {
        if ($nonexistent) {
            warn " Continuing anyway as requested.\n";
        } else {
            ask_yes_no(" Continue anyway? (Y/N): ") or exit;
        }
    }
}

# Read password from stdin and log in:
ReadMode 'noecho';
print STDERR "Password for $username \@ $server: ";
my $pass = <STDIN>;
print STDERR "\n";
ReadMode 'restore';    # restore echo before any chance of dying
defined $pass or die "No password given (end of input)!\n";
chomp $pass;

warn "Logging in to $server as $username...\n";
my $login = apireq( action => 'login', lgname => $username, lgpassword => $pass );
$login->{error}
    and die "Login as $username failed ($login->{error}{code}): $login->{error}{info}\n";
$login->{login}{result} eq 'Success'
    or die "Login as $username failed: $login->{login}{result}\n";

# Do the edits:
PAGE: foreach my $title (@ARGV) {
    warn "Fetching content for \"$title\"...\n";
    my $starttime = time;
    my $data = apireq(
        action  => 'query',
        prop    => 'info|revisions',
        intoken => 'edit',
        rvprop  => 'content|timestamp',
        titles  => $title,
    );
    my $page = $data->{query}{pages}{page};
    my $token = $page->{edittoken}
        or die "Failed to get edit token, got:\n", Dumper($data), "\n";
    my $content = $page->{revisions}{rev}{content};
    defined $content    # a page whose text is just "0" is still valid content
        or die "Failed to get content, got:\n", Dumper($data), "\n";
    my $timestamp = $page->{revisions}{rev}{timestamp}
        or die "Failed to get timestamp, got:\n", Dumper($data), "\n";
    my $original = $content;    # kept unmodified for the confirmation diff

    # Strip every link to each category slated for removal.
    my @removed;
    foreach my $cat (@rem) {
        # s///g returns "" when nothing matched; normalise to 0 so the
        # numeric comparison below does not warn under -w.
        my $n = ($content =~ s/$cat_re{$cat}//g) || 0;
        if ($n) {
            push @removed, $cat;
        } else {
            warn " Could not locate category \"$cat\" to remove.\n";
        }
        warn " $n instances of category \"$cat\" removed.\n" if $n > 1;
    }

    # Queue each category that is not on the page yet.  This only tests
    # for presence: an existing link must be left in place, not deleted.
    my @added;
    foreach my $cat (@add) {
        if ($content =~ /$cat_re{$cat}/) {
            warn " Category \"$cat\" already found, not adding.\n";
        } else {
            push @added, $cat;
        }
    }

    # Make sure the text ends in exactly one newline, then append the links.
    $content =~ s/\n?$/\n/;
    $content .= join "", map "[[$catns:$_]]\n", @added;

    # A page that just gained a category no longer needs the
    # "uncategorized" tag (unless --keep-uncategorized was given).
    my $uncat;
    if (@added and not $keep_uncat) {
        $uncat = ($content =~ s/$uncat_re//);
        warn " $uncat instance(s) of {{uncategorized}} removed.\n" if $uncat;
    }

    my @summary;
    push @summary, "added ".join(", ", map "[[$catns:$_]]", @added) if @added;
    push @summary, "removed ".join(", ", map "[[$catns:$_]]", @removed) if @removed;
    push @summary, "removed {{uncategorized}} template" if $uncat;
    my $autosummary = join "; ", @summary;

    unless (@summary) {
        warn " Nothing to do, skipping!\n";
        next PAGE;
    }

    if ($confirm) {
        warn "--- Difference between revisions: ---\n";

        # Generate and print diff.  compact_diff returns a flat list of
        # (old index, new index) pairs marking where runs of identical
        # and differing lines alternate, beginning with an identical run.
        my @old = split /\n/, $original;
        my @new = split /\n/, $content;
        my @cdiff = compact_diff( \@old, \@new );
        my $ctx = 3;    # lines of context shown around each change
        while (@cdiff > 2) {
            my ($ca, $cb, $sa, $sb, $ea, $eb) = @cdiff;
            if ($ca < $sa - 2*$ctx or !$ca and !$cb) {
                # Long unchanged run (or the very first one): show only
                # $ctx lines at each end with "..." in between.
                my $len = $sa - $ca;
                $len = $ctx if $len > $ctx;
                print STDERR map " $_\n", @old[$ca .. $ca+$len-1] if $ca or $cb;
                print STDERR "...\n";
                print STDERR map " $_\n", @old[$sa-$len .. $sa-1] if @cdiff > 4;
            } else {
                print STDERR map " $_\n", @old[$ca .. $sa-1];
            }
            splice @cdiff, 0, 4;
            last unless @cdiff;    # that was the trailing unchanged run
            print STDERR map "-$_\n", @old[$sa .. $ea-1];
            print STDERR map "+$_\n", @new[$sb .. $eb-1];
        }

        ask_yes_no(" Okay to save this edit? (Y/N): ") or next PAGE;
    }

    warn " Saving: $autosummary ...\n";
    # NOTE(review): the MediaWiki edit API documents "starttimestamp" for
    # the time editing began; confirm whether "lasttimestamp" is honoured.
    my $edit = apireq(
        action        => 'edit',
        text          => $content,
        md5           => md5_hex(encode_utf8($content)),
        title         => $title,
        basetimestamp => $timestamp,
        lasttimestamp => strftime("%Y-%m-%dT%H:%M:%S", gmtime $starttime),
        summary       => $summary || $autosummary,
        minor         => 1,
        bot           => 1,
        token         => $token,
    );
    if ($edit->{error}) {
        if ($edit->{error}{code} eq 'editconflict') {
            warn " Edit conflict, retrying!\n";
            redo PAGE;    # re-fetch the page and apply the changes again
        }
        die " API Error ($edit->{error}{code}): $edit->{error}{info}\n";
    } elsif ($edit->{edit}{result} ne 'Success') {
        die " Edit failed ($edit->{edit}{result}):\n", Dumper($edit), "\n";
    }
}

__DATA__