#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This bot will standardize footnote references. It will retrieve information on
which pages might need changes either from an SQL dump (no longer supported)
or a text file, or only change a single page.
At present it converts to [[Wikipedia:Footnote3]] format (ref/note).
NOTE: This script is not capable of handling the <ref></ref> syntax. It just
handles the {{ref}} syntax, which is still used, but DEPRECATED on the English
Wikipedia.
You can run the bot with the following commandline parameters:
-file - Work on all pages given in a local text file.
Will read any [[wiki link]] and use these articles.
Argument can also be given as "-file:filename".
-cat - Work on all pages which are in a specific category.
Argument can also be given as "-cat:categoryname".
-page - Only edit a single page.
Argument can also be given as "-page:pagename". You can give this
parameter multiple times to edit multiple pages.
-regex - Make replacements using regular expressions. (Obsolete; always True)
-except:XYZ - Ignore pages which contain XYZ. If the -regex argument is given,
XYZ will be regarded as a regular expression.
-namespace:n - Namespace to process. Works only with a sql dump
-always - Don't prompt you for each replacement
other: - First argument is the old text, second argument is the new text.
If the -regex argument is given, the first argument will be
regarded as a regular expression, and the second argument might
contain expressions like \\1 or \g<name>.
NOTE: Only use either -sql or -file or -page, but don't mix them.
"""
# Derived from replace.py
#
# (C) Daniel Herding, 2004
# Copyright Scot E. Wilcoxon 2005
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id: standardize_notes.py 4537 2007-11-13 17:05:02Z leogregianin $'
#
# 2005-07-15: Find name of section containing citations: doFindRefSection(). (SEWilco)
# 2005-07-15: Obey robots.txt restrictions. (SEWilco)
# 2005-07-15: Build list of all sections which may contain citations: doFindAllCitationSections(). (SEWilco)
#
#from __future__ import generators
import subprocess, sys, re, random
import socket, urllib, robotparser
import wikipedia, pagegenerators
from datetime import date
import cgitb; cgitb.enable()
# httpcache is an optional dependency: record whether it could be imported so
# later code can test have_httpcache before using HTTPCache.
try:
    from httpcache import HTTPCache
except ImportError:
    have_httpcache = False
else:
    have_httpcache = True
# Edit-summary templates, keyed by language code.  Every value contains one
# '%s' placeholder.
msg = {
    'de': u'Bot: Automatisierte Textersetzung %s',
    'en': u'Robot: Automated reference processing %s',
    'es': u'Robot: Reemplazo automático de texto %s',
    'fr': u'Bot : Remplacement de texte automatisé %s',
    'he': u'בוט: הופך את הערת השוליים %s לאוטומטית',
    'hu': u'Robot: Automatikus szövegcsere %s',
    'ia': u'Robot: Reimplaciamento automatic de texto %s',
    'is': u'Vélmenni: breyti texta %s',
    'pl': u'Robot automatycznie przetwarza źródła %s',
    'pt': u'Bot: Mudança automática %s',
}
# Predefined fix sets.  Each entry maps a fix name to a dictionary with:
#   'regex'        - whether patterns are regular expressions
#   'exceptions'   - patterns; a page containing one of them is skipped
#   'msg'          - edit summaries keyed by language code
#   'replacements' - ordered list of (pattern, replacement) pairs
fixes = {
    # These replacements will convert alternate reference formats to the
    # format used by this tool.
    'ALTREFS': {
        'regex': True,
        # We don't want to mess up pages which discuss HTML tags, so we skip
        # all pages which contain nowiki tags.
        'exceptions': ['<nowiki>[^<]{3,}</nowiki>'],
        'msg': {
            'en':u'Robot: Adding/sorting references.',
            'he':u'בוט: מוסיף/מסדר הערות שוליים',
            'ia':u'Robot: Addition/assortimento de referentias',
            'pl':u'Robot dodaje/sortuje źródła',
        },
        'replacements': [
            # Everything case-insensitive (?i)
            # These translate variations of footnote templates to ref|note format.
            (r'(?i){{an\|(.*?)}}', r"{{ref|\1}}"),
            (r'(?i){{anb\|(.*?)}}', r"{{note|\1}}"),
            (r'(?i){{endnote\|(.*?)}}', r"{{note|\1}}"),
            (r'(?i){{fn\|(.*?)}}', r"{{ref|fn\1}}"),
            (r'(?i){{fnb\|(.*?)}}', r"{{note|fn\1}}"),
            (r'(?i){{namedref\|(.*?)\|.*?}}', r"{{ref|fn\1}}"),
            (r'(?i){{namednote\|(.*?)\|.*?}}', r"{{note|fn\1}}"),
            # subst: fn and fnb
            # NOTE(review): both patterns below presumably had HTML wrapper
            # markup that was lost from this copy of the file.  The first
            # still works as written and is left untouched.  The second
            # referenced \1 without any capture group (an invalid group
            # reference), so its wrapper has been reconstructed from the
            # substituted form of Template:Fnb -- confirm against upstream.
            (r'(?i)[[][[]#fn(.*?)[|][0-9a-z]*[]][]]', r"{{ref|fn\1}}"),
            (r'(?i)<cite id="fn_(.*?)">[[][[]#fn.*?[]][]]</cite>', r"{{note|fn_\1}}"),
            (r'(?i){{mn\|(.*?)\|(.*?)}}', r"{{ref|mn\1_\2}}"),
            (r'(?i){{mnb\|(.*?)\|(.*?)}}', r"{{note|mn\1_\2}}"),
            # a header where only spaces are in the same line; each HTML
            # heading level maps to the wiki heading with the same number
            # of '=' signs.
            (r'(?i)([\r\n]) *<h1> *([^<]+?) *</h1> *([\r\n])', r"\1= \2 =\3"),
            (r'(?i)([\r\n]) *<h2> *([^<]+?) *</h2> *([\r\n])', r"\1== \2 ==\3"),
            (r'(?i)([\r\n]) *<h3> *([^<]+?) *</h3> *([\r\n])', r"\1=== \2 ===\3"),
            (r'(?i)([\r\n]) *<h4> *([^<]+?) *</h4> *([\r\n])', r"\1==== \2 ====\3"),
            (r'(?i)([\r\n]) *<h5> *([^<]+?) *</h5> *([\r\n])', r"\1===== \2 =====\3"),
            (r'(?i)([\r\n]) *<h6> *([^<]+?) *</h6> *([\r\n])', r"\1====== \2 ======\3"),
            # A bare http URL; does not recognize all formats
            #(r'(?i) http://([^ ]*)', r" [http://\1]"),
        ]
    }
}
# Lower-case section headings that are treated as reference/citation
# sections.  Headings are lower-cased and stripped before being looked up
# in this list.
referencesectionnames = [
    'bibliography',
    'citation',
    'citations',
    'external link',
    'external links',
    'external links and references',
    'footnotes',
    'links',
    'notes',
    'notes and references',
    'reference',
    'references',
    'source',
    'sources',
]
# News sites for which 'news reference' citations are generated.
# Each entry is (host name, organisation name, title prefix to strip).
newssites = [
    ('abcnews.go.com', 'ABC News', 'ABC News: '),
    ('books.guardian.co.uk', 'The Guardian', 'Guardian Unlimited : The Guardian : '),
    ('edition.cnn.com', 'CNN', 'CNN.com - '),
    ('news.bbc.co.uk', 'BBC', 'BBC NEWS : '),
    ('news.scotsman.com', 'The Scotsman', 'Scotsman.com News - '),
    ('nyobserver.com', 'New York Observer', ''),
    ('observer.guardian.co.uk', 'The Guardian', 'The Observer : '),
    ('politics.guardian.co.uk', 'The Guardian', 'Guardian Unlimited Politics : '),
    ('seattletimes.nwsource.com', 'The Seattle Times', 'The Seattle Times: '),
    ('service.spiegel.de', 'Der Spiegel', ''),
    ('thescotsman.scotsman.com', 'The Scotsman', 'The Scotsman - '),
    ('today.reuters.com', 'Reuters', 'Latest News and Financial Information : '),
    ('today.reuters.co.uk', 'Reuters', 'Latest News and Financial Information : '),
    ('www.boston.com', 'The Boston Globe', 'Boston.com / '),
    ('www.cbsnews.com', 'CBS News', 'CBS News : '),
    ('www.cnn.com', 'CNN', 'CNN.com - '),
    ('www.cnsnews.com', 'Cybercast News Service', ''),
    ('www.csmonitor.com', 'Christian Science Monitor', ''),
    ('www.dallasnews.com', 'The Dallas Morning News', ''),
    ('www.forbes.com', 'Forbes', ''),
    ('www.foxnews.com', 'Fox News Channel', 'FOXNews.com - '),
    ('www.gnn.com', 'Government News Network', 'GNN - '),
    ('www.guardian.co.uk', 'The Guardian', 'Guardian Unlimited : The Guardian : '),
    ('www.latimes.com', 'Los Angeles Times', ''),
    ('www.msnbc.msn.com', 'MSNBC', ''),
    ('www.nationalreview.com', 'National Review', ''),
    ('www.nytimes.com', 'The New York Times', ''),
    ('www.sfgate.com', 'San Francisco Chronicle', ''),
    ('www.socialistworker.co.uk', 'Socialist Worker', ''),
    ('www.spectator.org', 'The American Spectator', ''),
    ('www.telegraph.co.uk', 'The Daily Telegraph', 'Telegraph newspaper online - '),
    ('www.time.com', 'TIME', ''),
    ('www.timesonline.co.uk', 'The Times', 'World news from The Times and the Sunday Times - '),
    ('www.usatoday.com', 'USA Today', 'USATODAY.com - '),
    ('www.washingtonpost.com', 'The Washington Post', ''),
    ('www.washtimes.com', 'The Washington Times', ''),
    ('www.weeklystandard.com', 'The Weekly Standard', ''),
    ('www.wired.com', 'Wired magazine', 'Wired News: '),
    ('wwwimage.cbsnews.com', 'CBS News', 'CBS News : '),
]
class ReplacePageGenerator:
    """
    Iterable which yields Pages for pages that might contain text to
    replace. These pages might be retrieved from a local SQL dump file, a
    text file, a category, or a list of page names entered by the user.

    Arguments:
        * source       - Where the bot should retrieve the page list from.
                         Can be 'sqldump', 'textfile', 'category' or
                         'userinput'. Any other value yields no pages.
        * replacements - A sequence of (old, new) pairs; only the old texts
                         are used here, to pre-filter SQL dump entries.
        * exceptions   - A list of strings; pages which contain one of these
                         won't be yielded from a SQL dump.
        * regex        - If the entries of replacements and exceptions should
                         be interpreted as regular expressions
        * namespace    - Namespace to process in case of a SQL dump. -1 means
                         that all namespaces should be searched.
        * textfilename - The textfile's path, either absolute or relative,
                         which will be used when source is 'textfile'.
        * sqlfilename  - The dump's path, either absolute or relative, which
                         will be used when source is 'sqldump'.
        * categoryname - The category name which will be used when source is
                         'category'.
        * pagenames    - a list of pages which will be used when source is
                         'userinput'.
        * pagetitle    - title of a wiki page listing the pages to work on;
                         only read by the unfinished
                         read_pages_from_wiki_page().
    """
    def __init__(self, source, replacements, exceptions, regex = False, namespace = -1, textfilename = None, sqlfilename = None, categoryname = None, pagenames = None, pagetitle = None):
        self.source = source
        self.replacements = replacements
        self.exceptions = exceptions
        self.regex = regex
        self.namespace = namespace
        self.textfilename = textfilename
        self.sqlfilename = sqlfilename
        self.categoryname = categoryname
        self.pagenames = pagenames
        # Previously never set although read_pages_from_wiki_page() reads it.
        self.pagetitle = pagetitle

    def _prepare_patterns(self, patterns):
        """Return patterns as a list, compiled once if self.regex is set."""
        if self.regex:
            return [re.compile(pattern) for pattern in patterns]
        return list(patterns)

    def _contains_any(self, text, patterns):
        """Tell whether text contains one of the prepared patterns."""
        for pattern in patterns:
            if self.regex:
                if pattern.search(text):
                    return True
            elif text.find(pattern) != -1:
                return True
        return False

    def read_pages_from_sql_dump(self):
        """
        Generator which will yield Pages for pages that might contain text
        to replace. These pages will be retrieved from a local sql dump file
        (cur table).

        Reads self.sqlfilename, self.namespace, self.exceptions,
        self.replacements and self.regex. An entry is yielded when it is in
        the requested namespace, contains none of the exceptions and
        contains at least one of the old texts.
        """
        mysite = wikipedia.getSite()
        import sqldump
        dump = sqldump.SQLdump(self.sqlfilename, wikipedia.getSite().encoding())
        # Compile once instead of once per dump entry.
        exceptions = self._prepare_patterns(self.exceptions)
        wanted = self._prepare_patterns([old for old, new in self.replacements])
        for entry in dump.entries():
            if self.namespace != -1 and self.namespace != entry.namespace:
                continue
            if self._contains_any(entry.text, exceptions):
                continue
            if self._contains_any(entry.text, wanted):
                yield wikipedia.Page(mysite, entry.full_title())

    def read_pages_from_category(self):
        """
        Generator which will yield the articles of the category named by
        self.categoryname, without recursing into subcategories.
        """
        import catlib
        category = catlib.Category(wikipedia.getSite(), self.categoryname)
        for page in category.articles(recurse = False):
            yield page

    def read_pages_from_text_file(self):
        """
        Generator which will yield pages that are listed in a text file
        created by the bot operator. Will regard everything inside
        [[double brackets]] as a page name, and yield Pages for these pages
        in the order they appear; several links on one line are all used.

        Reads self.textfilename (absolute or relative path).
        """
        f = open(self.textfilename, 'r')
        try:
            # Read everything up front so the file is closed even if the
            # consumer abandons the generator early.
            lines = f.readlines()
        finally:
            f.close()
        # regular expression which will find [[wiki links]]
        R = re.compile(r'\[\[([^\]]*)\]\]')
        for line in lines:
            for title in R.findall(line):
                yield wikipedia.Page(wikipedia.getSite(), title)

    def read_pages_from_wiki_page(self):
        '''
        UNFINISHED: intended to yield pages that are listed in a wiki page,
        regarding everything inside [[double brackets]] as a page name,
        except for interwiki and category links.

        Currently it only fetches the text of the page named by
        self.pagetitle and discards it; it is not used by __iter__.
        '''
        listpage = wikipedia.Page(wikipedia.getSite(), self.pagetitle)
        # NOTE(review): wikipedia.get() is kept as in the original; confirm
        # that this is the intended call for fetching the page text.
        listtext = wikipedia.get(listpage)
        # TODO - UNFINISHED

    # TODO: Make MediaWiki's search feature available.

    def __iter__(self):
        '''
        Starts the generator selected by self.source.
        '''
        if self.source == 'sqldump':
            for pl in self.read_pages_from_sql_dump():
                yield pl
        elif self.source == 'textfile':
            for pl in self.read_pages_from_text_file():
                yield pl
        elif self.source == 'category':
            for pl in self.read_pages_from_category():
                yield pl
        elif self.source == 'userinput':
            # pagenames defaults to None; treat that as "no pages".
            for pagename in (self.pagenames or []):
                yield wikipedia.Page(wikipedia.getSite(), pagename)
class ReplaceRobot:
def __init__(self, generator, replacements, refsequence, references, refusage, exceptions = [], regex = False, acceptall = False):
self.generator = generator
self.replacements = replacements
self.exceptions = exceptions
self.regex = regex
self.acceptall = acceptall
self.references = references
self.refsequence = refsequence
self.refusage = refusage
def checkExceptions(self, original_text):
"""
If one of the exceptions applies for the given text, returns the
substring. which matches the exception. Otherwise it returns None.
"""
for exception in self.exceptions:
if self.regex:
exception = re.compile(exception)
hit = exception.search(original_text)
if hit:
return hit.group(0)
else:
hit = original_text.find(exception)
if hit != -1:
return original_text[hit:hit + len(exception)]
return None
def doReplacements(self, new_text):
    """
    Apply every configured replacement to the given text, then run the
    reference-processing pipeline on the result and return the final text.

    Updates self.references and self.refusage along the way.
    """
    # Stage 1: the plain/regex replacements.
    for pattern, substitute in self.replacements:
        if not self.regex:
            new_text = new_text.replace(pattern, substitute)
            continue
        # TODO: compiling the regex each time might be inefficient
        new_text = re.compile(pattern).sub(substitute, new_text)

    # Stage 2: locate the Notes section and all citation sections.
    section_name = self.doFindRefSection( new_text )
    # The returned list is not used here; the call is kept as in the
    # established sequence.
    self.doFindAllCitationSections( new_text, section_name )

    # Stage 3: load the existing Notes section into self.references.
    wikipedia.output( u"Reading existing Notes section" )
    self.doReadReferencesSection( new_text, section_name )
    # Drop trailing empty lines.
    while self.references and self.references[-1] == u'\n':
        del self.references[-1]

    # Stage 4: turn external links into footnote references.
    wikipedia.output( u"Converting external links" )
    new_text = self.doConvertExternalLinks( new_text )

    # Stage 5: build the ordered list of all references.
    wikipedia.output( u"Collecting references" )
    (_duplicate_found, self.refusage) = self.doBuildSequenceListOfReferences( new_text )

    # Stage 6: rewrite the references, duplicates included.
    wikipedia.output( u"Rewriting references" )
    new_text = self.doRewriteReferences( new_text, self.refusage, section_name )

    # Stage 7: put the Notes in the same order as the references.
    wikipedia.output( u"Collating references" )
    self.references = self.doReorderReferences( self.references, self.refusage)

    # Stage 8: write the Notes section back into the text.
    wikipedia.output( u"Rebuilding References section" )
    return self.doUpdateReferencesSection( new_text, self.refusage, section_name )
def doConvertExternalLinks(self, original_text):
"""
Returns the text which is generated by converting external links to References.
Adds References to reference list.
"""
# NOTE(review): the body below appears to be damaged: the statement under
# "Ignore lines within comments" is incomplete, and the lines after it only
# append to self.references and never convert a link, so they may belong to
# a different method. Restore from the upstream file before relying on it.
new_text = '' # Default is no text
skipsection = False
for text_line in original_text.splitlines(True): # Scan all text line by line
# Check for protected sections
# NOTE(review): the named group in this pattern has no name ("(?P[") and
# will be rejected by re; the lookup below reads group 'sectionname', so
# the name was presumably "sectionname" -- confirm.
m = re.search("== *(?P[^\]\|=]*) *==", text_line)
# TODO: support subheadings within Notes section
# TODO: support Notes in alphabetic order
# TODO: support Notes in other orders
if m: # if in a section, check if should skip this section
if m.group('sectionname').lower().strip() in referencesectionnames:
skipsection = True # skipsection left True so no further links converted
if skipsection:
new_text = new_text + text_line # skip section, so retain text.
else:
# TODO: recognize {{inline}} invisible footnotes when something can be done with them
#
# Ignore lines within comments
# NOTE(review): incomplete statement (unbalanced parenthesis, empty string
# literal); text seems to have been lost between the two quotes.
if not text_line.startswith( u'': # This line ends some Notes sections
intargetsection = False # flag as not being in section
if text_line.strip() == u'': # This line ends some Notes sections
intargetsection = False # flag as not being in section
if intargetsection: # if still inside target section
# Convert any # wiki list to *; will be converted later if a reference
if text_line[0] == '#':
text_line = '*' + text_line[1:] # replace # with * wiki
self.references.append( text_line.rstrip() + u'\n' ) # Append line to references
new_text = new_text + text_line.rstrip() + u'\n'
return new_text
def doReorderReferences(self, references, refusage):
    """
    Returns the new references list after reordering to match refusage.

    Non-references are moved to top, then the used references follow in
    document order with their note tags rewritten, and unused references
    go to the bottom.

    Arguments:
        * references - list of reference lines (strings); None entries are
                       ignored. The list is not modified.
        * refusage   - dictionary mapping a reference name to a sequence
                       whose item 0 is its position in the document
                       (0-based) and item 1 the number of times it is used.

    When either argument is empty, references is returned unchanged.
    """
    # TODO: add tests for duplicate references/Ibid handling.
    if references == [] or refusage == {}:
        return references

    # A list item holding a {{note|name}} tag.
    # NOTE(review): the first group's name is never read in this method;
    # "reftype" is an assumed name -- confirm against the upstream file.
    note_line = re.compile(r'(?i)[*#][\s]*{{(?P<reftype>note)\|(?P<refname>[^}|]+?)}}')
    # Same, but spanning all neighboring note|note_label expressions.
    note_run = re.compile(r'(?i)[*#]([\s]*{{(?P<reftype>note|note_label)\|(?P<refname>[^}|]+?)}})+')
    alphabet26 = u'abcdefghijklmnopqrstuvwxyz'

    # Split the lines into non-references (kept on top, original order) and
    # candidate reference lines.
    nonrefs = []
    pending = []
    for text_line in references:
        if text_line is None:
            continue
        # Special test to ignore Footnote instructions comment.
        if text_line.strip().startswith(u'4) Add ') or not note_line.search(text_line):
            nonrefs.append(text_line)
        else:
            pending.append(text_line)

    # Map document position -> reference name.
    refsort = {}
    for refkey in refusage.keys():
        refsort[refusage[refkey][0]] = refkey

    # Collect references in document order. Iterating the sorted positions
    # (rather than range(len(refsort))) tolerates gaps in the numbering.
    ordered = []
    for sequence in sorted(refsort.keys()):
        refkey = refsort[sequence]
        for position, search_line in enumerate(pending):
            m2 = note_run.search(search_line)
            if not m2 or m2.group('refname').strip() != refkey:
                continue
            # Rewrite the note tag; add one back-link label per use when
            # the citation is referenced more than once.
            note_text = '# {{note|%s}}' % refkey
            if refusage[refkey][1] > 1:
                for n in range(refusage[refkey][1]):
                    note_text = note_text + '{{note_label|%s|%d|%s}}' % (refkey, sequence + 1, alphabet26[n % 26])
            ordered.append(search_line[:m2.start(0)] + note_text + search_line[m2.end(0):])
            del pending[position] # used; safe because the scan stops here
            break

    # Whatever is still pending was never referenced: append at the bottom.
    return nonrefs + ordered + pending
def doUpdateReferencesSection(self, original_text, refusage, refsectionname):
"""
Returns the text which is generated by rebuilding the Notes section.
Rewrite Notes section from references list.
"""
new_text = '' # Default is no text
intargetsection = False
for text_line in original_text.splitlines(True): # Scan all text line by line
# Check for target section
m = re.search( r'==+(?P[^=]+)==', text_line )
if m: # if in a section, check if Notes section
if refsectionname != '': # if a certain section name has been identified
m_section = m.group('sectionname')
wikipedia.output( u'Looking for "%s": "%s"' % (refsectionname,m_section) )
if unicode(m_section.strip()) == unicode(refsectionname):
wikipedia.output( u'Updating Ref section.' )
intargetsection = True # flag as being in section
else:
intargetsection = False # flag as not being in section
else: # else grab all possible sections
if m.group('sectionname').lower().strip() in referencesectionnames:
intargetsection = True # flag as being in section
else:
intargetsection = False # flag as not being in section
if intargetsection:
new_text = new_text + text_line # append new line to new text
if self.references != []:
for newref in self.references: # scan through all references
if newref != None:
new_text = new_text + newref.rstrip() + u'\n' # insert references
new_text = new_text + u'\n' # one trailing blank line
self.references = [] # empty references
else:
new_text = new_text + text_line # copy section headline
else:
if intargetsection:
if text_line.strip() != '':
if text_line.lstrip()[0] in u'[{': # if line starts with non-Ref WikiSyntax
intargetsection = False # flag as not being in section
# TODO: need better way to handle special cases at end of refs
if text_line.strip() == u'': # This line ends some Notes sections
intargetsection = False # flag as not being in section
if text_line.strip() == u'': # This line ends some Notes sections
intargetsection = False # flag as not being in section
if not intargetsection: # if not in Notes section, remember line
new_text = new_text + text_line # append new line to new text
# If references list not emptied, there was no Notes section found
if self.references != []: # empty references
# New Notes section needs to be created at bottom.
text_line_counter = 0 # current line
last_text_line_counter_value = 0 # number of last line of possible text
for text_line in original_text.splitlines(True): # Search for last normal text line
text_line_counter += 1 # count this line
if text_line.strip() != '':
if text_line.lstrip()[0].isalnum(): # if line starts with alphanumeric
last_text_line_counter = text_line_counter # number of last line of possible text
else:
if text_line.lstrip()[0] in u'<=!|*#': # if line starts with recognized wiki char
if not text_line.startswith(u'