#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This bot will standardize footnote references. It will retrieve information on
which pages might need changes either from an SQL dump (no longer supported) or
a text file, or only change a single page.

At present it converts to [[Wikipedia:Footnote3]] format (ref/note).

NOTE: This script is not capable of handling the <ref></ref> syntax. It just
handles the {{ref}} syntax, which is still used, but DEPRECATED on the English
Wikipedia.

You can run the bot with the following commandline parameters:

-file        - Work on all pages given in a local text file.
               Will read any [[wiki link]] and use these articles.
               Argument can also be given as "-file:filename".
-cat         - Work on all pages which are in a specific category.
               Argument can also be given as "-cat:categoryname".
-page        - Only edit a single page.
               Argument can also be given as "-page:pagename". You can give this
               parameter multiple times to edit multiple pages.
-regex       - Make replacements using regular expressions.
               (Obsolete; always True)
-except:XYZ  - Ignore pages which contain XYZ. If the -regex argument is given,
               XYZ will be regarded as a regular expression.
-namespace:n - Namespace to process. Works only with a sql dump
-always      - Don't prompt you for each replacement
other:       - First argument is the old text, second argument is the new text.
               If the -regex argument is given, the first argument will be
               regarded as a regular expression, and the second argument might
               contain expressions like \\1 or \g<name>.

NOTE: Only use either -sql or -file or -page, but don't mix them.
"""
# Derived from replace.py
#
# (C) Daniel Herding, 2004
# Copyright Scot E. Wilcoxon 2005
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id: standardize_notes.py 4537 2007-11-13 17:05:02Z leogregianin $'
#
# 2005-07-15: Find name of section containing citations: doFindRefSection(). (SEWilco)
# 2005-07-15: Obey robots.txt restrictions.
(SEWilco) # 2005-07-15: Build list of all sections which may contain citations: doFindAllCitationSections(). (SEWilco) # #from __future__ import generators import subprocess, sys, re, random import socket, urllib, robotparser import wikipedia, pagegenerators from datetime import date import cgitb; cgitb.enable() # httpcache is optional have_httpcache = True try: from httpcache import HTTPCache except ImportError: have_httpcache = False # Summary messages in different languages msg = { 'de':u'Bot: Automatisierte Textersetzung %s', 'en':u'Robot: Automated reference processing %s', 'es':u'Robot: Reemplazo automático de texto %s', 'fr':u'Bot : Remplacement de texte automatisé %s', 'he':u'בוט: הופך את הערת השוליים %s לאוטומטית', 'hu':u'Robot: Automatikus szövegcsere %s', 'ia':u'Robot: Reimplaciamento automatic de texto %s', 'is':u'Vélmenni: breyti texta %s', 'pl':u'Robot automatycznie przetwarza źródła %s', 'pt':u'Bot: Mudança automática %s', } fixes = { # These replacements will convert alternate reference formats to format used by this tool. 'ALTREFS': { 'regex': True, # We don't want to mess up pages which discuss HTML tags, so we skip # all pages which contain nowiki tags. 'exceptions': ['[^<]{3,}'], 'msg': { 'en':u'Robot: Adding/sorting references.', 'he':u'בוט: מוסיף/מסדר הערות שוליים', 'ia':u'Robot: Addition/assortimento de referentias', 'pl':u'Robot dodaje/sortuje źródła', }, 'replacements': [ # Everything case-insensitive (?i) # These translate variations of footnote templates to ref|note format. 
(r'(?i){{an\|(.*?)}}', r"{{ref|\1}}"), (r'(?i){{anb\|(.*?)}}', r"{{note|\1}}"), (r'(?i){{endnote\|(.*?)}}', r"{{note|\1}}"), (r'(?i){{fn\|(.*?)}}', r"{{ref|fn\1}}"), (r'(?i){{fnb\|(.*?)}}', r"{{note|fn\1}}"), (r'(?i){{namedref\|(.*?)\|.*?}}', r"{{ref|fn\1}}"), (r'(?i){{namednote\|(.*?)\|.*?}}', r"{{note|fn\1}}"), # subst: fn and fnb (r'(?i)[[][[]#fn(.*?)[|][0-9a-z]*[]][]]', r"{{ref|fn\1}}"), (r'(?i)[[][[]#fn.*?[]][]]', r"{{note|fn_\1}}"), (r'(?i){{mn\|(.*?)\|(.*?)}}', r"{{ref|mn\1_\2}}"), (r'(?i){{mnb\|(.*?)\|(.*?)}}', r"{{note|mn\1_\2}}"), # a header where only spaces are in the same line (r'(?i)([\r\n]) *

*([^<]+?) *

*([\r\n])', r"\1= \2 =\3"), (r'(?i)([\r\n]) *

*([^<]+?) *

*([\r\n])', r"\1== \2 ==\3"), (r'(?i)([\r\n]) *

*([^<]+?) *

*([\r\n])', r"\1=== \2 ===\3"), (r'(?i)([\r\n]) *

*([^<]+?) *

*([\r\n])', r"\1==== \2 ====\3"), (r'(?i)([\r\n]) *
*([^<]+?) *
*([\r\n])', r"\1===== \2 =====\3"), (r'(?i)([\r\n]) *
*([^<]+?) *
*([\r\n])', r"\1====== \2 ======\3"), # A bare http URL; does not recognize all formats #(r'(?i) http://([^ ]*)', r" [http://\1]"), ] } } # names of reference section names referencesectionnames = [ 'bibliography', 'citation', 'citations', 'external link', 'external links', 'external links and references', 'footnotes', 'links', 'notes', 'notes and references', 'reference', 'references', 'source', 'sources', ] # news sites for which to generate 'news reference' citations, the org name, and prefix to strip newssites = [ ( 'abcnews.go.com', 'ABC News', 'ABC News: ' ), ( 'books.guardian.co.uk', 'The Guardian', 'Guardian Unlimited : The Guardian : ' ), ( 'edition.cnn.com', 'CNN', 'CNN.com - ' ), ( 'news.bbc.co.uk', 'BBC', 'BBC NEWS : ' ), ( 'news.scotsman.com', 'The Scotsman', 'Scotsman.com News - ' ), ( 'nyobserver.com', 'New York Observer', '' ), ( 'observer.guardian.co.uk', 'The Guardian', 'The Observer : ' ), ( 'politics.guardian.co.uk', 'The Guardian', 'Guardian Unlimited Politics : ' ), ( 'seattletimes.nwsource.com', 'The Seattle Times', 'The Seattle Times: ' ), ( 'service.spiegel.de', 'Der Spiegel', '' ), ( 'thescotsman.scotsman.com', 'The Scotsman', 'The Scotsman - ' ), ( 'today.reuters.com', 'Reuters', 'Latest News and Financial Information : ' ), ( 'today.reuters.co.uk', 'Reuters', 'Latest News and Financial Information : ' ), ( 'www.boston.com', 'The Boston Globe', 'Boston.com / ' ), ( 'www.cbsnews.com', 'CBS News', 'CBS News : ' ), ( 'www.cnn.com', 'CNN', 'CNN.com - ' ), ( 'www.cnsnews.com', 'Cybercast News Service', '' ), ( 'www.csmonitor.com', 'Christian Science Monitor', '' ), ( 'www.dallasnews.com', 'The Dallas Morning News', '' ), ( 'www.forbes.com', 'Forbes', '' ), ( 'www.foxnews.com', 'Fox News Channel', 'FOXNews.com - ' ), ( 'www.gnn.com', 'Government News Network', 'GNN - ' ), ( 'www.guardian.co.uk', 'The Guardian', 'Guardian Unlimited : The Guardian : ' ), ( 'www.latimes.com', 'Los Angeles Times', '' ), ( 'www.msnbc.msn.com', 'MSNBC', '' ), ( 
'www.nationalreview.com', 'National Review', '' ), ( 'www.nytimes.com', 'The New York Times', '' ), ( 'www.sfgate.com', 'San Francisco Chronicle', '' ), ( 'www.socialistworker.co.uk', 'Socialist Worker', '' ), ( 'www.spectator.org', 'The American Spectator', '' ), ( 'www.telegraph.co.uk', 'The Daily Telegraph', 'Telegraph newspaper online - ' ), ( 'www.time.com', 'TIME', '' ), ( 'www.timesonline.co.uk', 'The Times', 'World news from The Times and the Sunday Times - ' ), ( 'www.usatoday.com', 'USA Today', 'USATODAY.com - ' ), ( 'www.washingtonpost.com', 'The Washington Post', '' ), ( 'www.washtimes.com', 'The Washington Times', '' ), ( 'www.weeklystandard.com', 'The Weekly Standard', '' ), ( 'www.wired.com', 'Wired magazine', 'Wired News: ' ), ( 'wwwimage.cbsnews.com', 'CBS News', 'CBS News : ' ), ] class ReplacePageGenerator: """ Generator which will yield Pages for pages that might contain text to replace. These pages might be retrieved from a local SQL dump file or a text file, or as a list of pages entered by the user. Arguments: * source - Where the bot should retrieve the page list from. Can be 'sqldump', 'textfile' or 'userinput'. * replacements - A dictionary where keys are original texts and values are replacement texts. * exceptions - A list of strings; pages which contain one of these won't be changed. * regex - If the entries of replacements and exceptions should be interpreted as regular expressions * namespace - Namespace to process in case of a SQL dump. -1 means that all namespaces should be searched. * textfilename - The textfile's path, either absolute or relative, which will be used when source is 'textfile'. * sqlfilename - The dump's path, either absolute or relative, which will be used when source is 'sqldump'. * pagenames - a list of pages which will be used when source is 'userinput'. 
""" def __init__(self, source, replacements, exceptions, regex = False, namespace = -1, textfilename = None, sqlfilename = None, categoryname = None, pagenames = None): self.source = source self.replacements = replacements self.exceptions = exceptions self.regex = regex self.namespace = namespace self.textfilename = textfilename self.sqlfilename = sqlfilename self.categoryname = categoryname self.pagenames = pagenames def read_pages_from_sql_dump(self): """ Generator which will yield Pages to pages that might contain text to replace. These pages will be retrieved from a local sql dump file (cur table). Arguments: * sqlfilename - the dump's path, either absolute or relative * replacements - a dictionary where old texts are keys and new texts are values * exceptions - a list of strings; pages which contain one of these won't be changed. * regex - if the entries of replacements and exceptions should be interpreted as regular expressions """ mysite = wikipedia.getSite() import sqldump dump = sqldump.SQLdump(self.sqlfilename, wikipedia.getSite().encoding()) for entry in dump.entries(): skip_page = False if self.namespace != -1 and self.namespace != entry.namespace: continue else: for exception in self.exceptions: if self.regex: exception = re.compile(exception) if exception.search(entry.text): skip_page = True break else: if entry.text.find(exception) != -1: skip_page = True break if not skip_page: for old, new in self.replacements: if self.regex: old = re.compile(old) if old.search(entry.text): yield wikipedia.Page(mysite, entry.full_title()) break else: if entry.text.find(old) != -1: yield wikipedia.Page(mysite, entry.full_title()) break def read_pages_from_category(self): """ Generator which will yield pages that are listed in a text file created by the bot operator. Will regard everything inside [[double brackets]] as a page name, and yield Pages for these pages. 
Arguments: * textfilename - the textfile's path, either absolute or relative """ import catlib category = catlib.Category(wikipedia.getSite(), self.categoryname) for page in category.articles(recurse = False): yield page def read_pages_from_text_file(self): """ Generator which will yield pages that are listed in a text file created by the bot operator. Will regard everything inside [[double brackets]] as a page name, and yield Pages for these pages. Arguments: * textfilename - the textfile's path, either absolute or relative """ f = open(self.textfilename, 'r') # regular expression which will find [[wiki links]] R = re.compile(r'.*\[\[([^\]]*)\]\].*') m = False for line in f.readlines(): # BUG: this will only find one link per line. # TODO: use findall() instead. m=R.match(line) if m: yield wikipedia.Page(wikipedia.getSite(), m.group(1)) f.close() def read_pages_from_wiki_page(self): ''' Generator which will yield pages that are listed in a wiki page. Will regard everything inside [[double brackets]] as a page name, except for interwiki and category links, and yield Pages for these pages. Arguments: * pagetitle - the title of a page on the home wiki ''' listpage = wikipedia.Page(wikipedia.getSite(), self.pagetitle) list = wikipedia.get(listpage) # TODO - UNFINISHED # TODO: Make MediaWiki's search feature available. def __iter__(self): ''' Starts the generator. 
''' if self.source == 'sqldump': for pl in self.read_pages_from_sql_dump(): yield pl elif self.source == 'textfile': for pl in self.read_pages_from_text_file(): yield pl elif self.source == 'category': for pl in self.read_pages_from_category(): yield pl elif self.source == 'userinput': for pagename in self.pagenames: yield wikipedia.Page(wikipedia.getSite(), pagename) class ReplaceRobot: def __init__(self, generator, replacements, refsequence, references, refusage, exceptions = [], regex = False, acceptall = False): self.generator = generator self.replacements = replacements self.exceptions = exceptions self.regex = regex self.acceptall = acceptall self.references = references self.refsequence = refsequence self.refusage = refusage def checkExceptions(self, original_text): """ If one of the exceptions applies for the given text, returns the substring. which matches the exception. Otherwise it returns None. """ for exception in self.exceptions: if self.regex: exception = re.compile(exception) hit = exception.search(original_text) if hit: return hit.group(0) else: hit = original_text.find(exception) if hit != -1: return original_text[hit:hit + len(exception)] return None def doReplacements(self, new_text): """ Returns the text which is generated by applying all replacements to the given text. """ # For any additional replacements, loop through them for old, new in self.replacements: if self.regex: # TODO: compiling the regex each time might be inefficient oldR = re.compile(old) new_text = oldR.sub(new, new_text) else: new_text = new_text.replace(old, new) # Find name of Notes section. refsectionname = self.doFindRefSection( new_text ) # Get list of all sections which may contain citations. 
refsectionlist = self.doFindAllCitationSections( new_text, refsectionname ) # Read existing Notes section contents into references list wikipedia.output( u"Reading existing Notes section" ) self.doReadReferencesSection( new_text, refsectionname ) while self.references and self.references[len(self.references)-1] == u'\n': del self.references[len(self.references)-1] # delete trailing empty lines # Convert any external links to footnote references wikipedia.output( u"Converting external links" ) new_text = self.doConvertExternalLinks( new_text ) # Accumulate ordered list of all references wikipedia.output( u"Collecting references" ) (duplicatefound, self.refusage) = self.doBuildSequenceListOfReferences( new_text ) # Rewrite references, including dealing with duplicates. wikipedia.output( u"Rewriting references" ) new_text = self.doRewriteReferences( new_text, self.refusage, refsectionname ) # Reorder Notes to match sequence of ordered list wikipedia.output( u"Collating references" ) self.references = self.doReorderReferences( self.references, self.refusage) # Rebuild Notes section wikipedia.output( u"Rebuilding References section" ) new_text = self.doUpdateReferencesSection( new_text, self.refusage, refsectionname ) return new_text def doConvertExternalLinks(self, original_text): """ Returns the text which is generated by converting external links to References. Adds References to reference list. 
""" new_text = '' # Default is no text skipsection = False for text_line in original_text.splitlines(True): # Scan all text line by line # Check for protected sections m = re.search("== *(?P[^\]\|=]*) *==", text_line) # TODO: support subheadings within Notes section # TODO: support Notes in alphabetic order # TODO: support Notes in other orders if m: # if in a section, check if should skip this section if m.group('sectionname').lower().strip() in referencesectionnames: skipsection = True # skipsection left True so no further links converted if skipsection: new_text = new_text + text_line # skip section, so retain text. else: # TODO: recognize {{inline}} invisible footnotes when something can be done with them # # Ignore lines within comments if not text_line.startswith( u'': # This line ends some Notes sections intargetsection = False # flag as not being in section if text_line.strip() == u'': # This line ends some Notes sections intargetsection = False # flag as not being in section if intargetsection: # if still inside target section # Convert any # wiki list to *; will be converted later if a reference if text_line[0] == '#': text_line = '*' + text_line[1:] # replace # with * wiki self.references.append( text_line.rstrip() + u'\n' ) # Append line to references new_text = new_text + text_line.rstrip() + u'\n' return new_text def doReorderReferences(self, references, refusage): """ Returns the new references list after reordering to match refusage list Non-references are moved to top, unused references to bottom. """ # TODO: add tests for duplicate references/Ibid handling. newreferences = references if references != [] and refusage != {}: newreferences = [] for i in range(len(references)): # move nonrefs to top of list text_line = references[i] # TODO: compile search? m = re.search(r'(?i)[*#][\s]*{{(?Pnote)\|(?P[^}|]+?)}}', text_line) # Special test to ignore Footnote instructions comment. 
text_line_stripped = text_line.strip() if text_line_stripped.startswith(u'4) Add ') or not m: # if no ref found newreferences.append(text_line) # add nonref to new list references[i] = None refsort = {} for refkey in refusage.keys(): # build list of keys in document order refsort[ refusage[refkey][0] ] = refkey # refsort contains reference key names alphabet26 = u'abcdefghijklmnopqrstuvwxyz' for i in range(len(refsort)): # collect references in document order for search_num in range(len(references)): # find desired entry search_line = references[search_num] if search_line: # TODO: compile search? # Note that the expression finds all neighboring note|note_label expressions. m2 = re.search(r'(?i)[*#]([\s]*{{(?Pnote|note_label)\|(?P[^}|]+?)}})+', search_line) if m2: refkey = m2.group('refname').strip() if refkey == refsort[i]: # if expected ref found # Rewrite references note_text = '# {{note|%s}}' % refkey # rewrite note tag if refusage[refkey][1] > 1: # if more than one reference to citation for n in range(refusage[refkey][1]): # loop through all repetitions note_text = note_text + '{{note_label|%s|%d|%s}}' % (refkey,(refusage[refkey][0])+1,alphabet26[n%26]) search_line=search_line[:m2.start(0)] + note_text + search_line[m2.end(0):] newreferences.append(search_line) # found, add entry del references[search_num] # delete used reference break # stop the search loop after entry found newreferences = newreferences + references # append any unused references return newreferences def doUpdateReferencesSection(self, original_text, refusage, refsectionname): """ Returns the text which is generated by rebuilding the Notes section. Rewrite Notes section from references list. 
""" new_text = '' # Default is no text intargetsection = False for text_line in original_text.splitlines(True): # Scan all text line by line # Check for target section m = re.search( r'==+(?P[^=]+)==', text_line ) if m: # if in a section, check if Notes section if refsectionname != '': # if a certain section name has been identified m_section = m.group('sectionname') wikipedia.output( u'Looking for "%s": "%s"' % (refsectionname,m_section) ) if unicode(m_section.strip()) == unicode(refsectionname): wikipedia.output( u'Updating Ref section.' ) intargetsection = True # flag as being in section else: intargetsection = False # flag as not being in section else: # else grab all possible sections if m.group('sectionname').lower().strip() in referencesectionnames: intargetsection = True # flag as being in section else: intargetsection = False # flag as not being in section if intargetsection: new_text = new_text + text_line # append new line to new text if self.references != []: for newref in self.references: # scan through all references if newref != None: new_text = new_text + newref.rstrip() + u'\n' # insert references new_text = new_text + u'\n' # one trailing blank line self.references = [] # empty references else: new_text = new_text + text_line # copy section headline else: if intargetsection: if text_line.strip() != '': if text_line.lstrip()[0] in u'[{': # if line starts with non-Ref WikiSyntax intargetsection = False # flag as not being in section # TODO: need better way to handle special cases at end of refs if text_line.strip() == u'': # This line ends some Notes sections intargetsection = False # flag as not being in section if text_line.strip() == u'': # This line ends some Notes sections intargetsection = False # flag as not being in section if not intargetsection: # if not in Notes section, remember line new_text = new_text + text_line # append new line to new text # If references list not emptied, there was no Notes section found if self.references != []: # 
empty references # New Notes section needs to be created at bottom. text_line_counter = 0 # current line last_text_line_counter_value = 0 # number of last line of possible text for text_line in original_text.splitlines(True): # Search for last normal text line text_line_counter += 1 # count this line if text_line.strip() != '': if text_line.lstrip()[0].isalnum(): # if line starts with alphanumeric last_text_line_counter = text_line_counter # number of last line of possible text else: if text_line.lstrip()[0] in u'<=!|*#': # if line starts with recognized wiki char if not text_line.startswith(u'