#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This script goes over multiple pages, searches for pages where <references />
is missing although a <ref> tag is present, and in that case adds a new
references section.

These command line parameters can be used to specify which pages to work on:

&params;

    -xml          Retrieve information from a local XML dump (pages-articles
                  or pages-meta-current, see http://download.wikimedia.org).
                  Argument can also be given as "-xml:filename".

    -namespace:n  Number or name of namespace to process. The parameter can be
                  used multiple times. It works in combination with all other
                  parameters, except for the -start parameter. If you e.g.
                  want to iterate over all categories starting at M, use
                  -start:Category:M.

    -always       Don't prompt you for each replacement.

All other parameters will be regarded as part of the title of a single page,
and the bot will only work on that single page.

It is strongly recommended not to run this script over the entire article
namespace (using the -start) parameter, as that would consume too much
bandwidth. Instead, use the -xml parameter, or use another way to generate
a list of affected articles
"""

__version__='$Id: selflink.py 4187 2007-09-03 11:37:19Z wikipedian $'

import wikipedia, pagegenerators, catlib
#import editarticle
import re, sys

# This is required for the text that is shown when you run this script
# with the parameter -help.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp,
}

# Summary messages in different languages
msg = {
    'ar': u'روبوت: إضافة وسم <references /> مفقود',
    'de': u'Bot: Trage fehlendes <references /> nach',
    'en': u'Robot: Adding missing <references /> tag',
    'he': u'בוט: מוסיף תגית <references /> חסרה',
    'ja': u'ロボットによる: <references /> タグを追加。',
    'ko': u'봇: 이전에 없던 <references /> 추가',
    'lt': u'robotas: Pridedama trūkstama <references /> žymė',
    'pt': u'Bot: Adicionando a tag <references />',
    'zh': u'機器人: 增加遺失的 <references /> 標籤',
    'fr': u'Robot: Ajout de la balise <references /> manquante',
}

# References sections are usually placed before further reading / external
# link sections. This dictionary defines these sections, sorted by priority.
# For example, on an English wiki, the script would place the "References"
# section in front of the "Further reading" section, if that existed.
# Otherwise, it would try to put it in front of the "External links" section,
# or if that fails, the "See also" section, etc.
placeBeforeSections = {
    'ar': [              # no explicit policy on where to put the references
        u'وصلات خارجية',
        u'انظر أيضا',
        u'ملاحظات',
    ],
    'de': [              # no explicit policy on where to put the references
        u'Literatur',
        u'Weblinks',
        u'Siehe auch',
        u'Weblink',      # bad, but common singular form of Weblinks
    ],
    'en': [              # no explicit policy on where to put the references
        u'Further reading',
        u'External links',
        u'See also',
        u'Notes',
    ],
    'es': [
        u'Enlaces externos',
        u'Véase también',
        u'Notas',
    ],
    'fr': [
        u'Liens externes',
        u'Voir aussi',
        u'Notes',
    ],
    'hu': [
        u'Külső hivatkozások',
        u'Lásd még',
    ],
    'ja': [
        u'外部リンク',
    ],
    'ko': [              # no explicit policy on where to put the references
        u'외부 링크',
        u'외부링크',
        u'바깥 고리',
        u'바깥고리',
        u'바깥 링크',
        u'바깥링크',      # a comma was missing here, which silently merged
        u'외부 고리',     # this title with the next one
        u'외부고리',
    ],
    'lt': [              # no explicit policy on where to put the references
        u'Nuorodos',
    ],
    'pt': [
        u'Ligações externas',
        u'Veja também',
        u'Notas',
    ],
    'zh': [
        u'外部連结',
        u'外部链接',
    ],
}

# Titles of sections where a reference tag would fit into.
# The first title should be the preferred one: It's the one that
# will be used when a new section has to be created.
referencesSections = {
    'ar': [              # not sure about which ones are preferred.
        u'مراجع',
        u'ملاحظات',
    ],
    'de': [
        u'Einzelnachweise',  # The "Einzelnachweise" title is disputed, some people prefer the other variants
        u'Quellen',
        u'Quellenangaben',
        u'Fußnoten',
    ],
    'en': [              # not sure about which ones are preferred.
        u'References',
        u'Footnotes',
        u'Notes',
    ],
    'es': [
        u'Referencias',
        u'Notas',
    ],
    'fr': [
        u'Références',
        u'References',
        u'Notes',
    ],
    'he': [
        u'הערות שוליים',
    ],
    'hu': [
        u'Források és jegyzetek',
        u'Források',
        u'Jegyzetek',
        u'Hivatkozások',
        u'Megjegyzések',
    ],
    'ja': [
        u'脚注',
        u'脚注欄',
        u'脚注・出典',
        u'出典',
        u'注釈',
    ],
    'ko': [
        u'주석',
        u'각주',              # commas were missing after this entry and the
        u'주석 및 참고 자료',  # next one, which silently merged three titles
        u'주석 및 참고자료',
        u'주석 및 참고 출처',
    ],
    'lt': [              # not sure about which ones are preferred.
        u'Šaltiniai',
        u'Literatūra',
    ],
    # NOTE(review): these look like "place before" titles (external links /
    # see also) rather than references section titles -- confirm with a
    # pt-wiki editor before relying on them.
    'pt': [
        u'Ligações externas',
        u'Veja também',
    ],
    'zh': [
        u'參考文獻',
        u'参考文献',
        u'參考資料',
        u'参考资料',
        u'資料來源',
        u'资料来源',
        u'參見',
        u'参见',
        u'參閱',
        u'参阅',
    ],
}

# Templates which include a <references /> tag. If there is no such template
# on your wiki, you don't have to enter anything here.
referencesTemplates = {
    'wikipedia': {
        'ar': [u'Reflist', u'ثبت المراجع', u'قائمة المراجع'],
        'en': [u'Reflist', u'Refs', u'FootnotesSmall', u'Reference',
               u'Ref-list', u'Reference list', u'References-small', u'Reflink',
               u'Footnotes', u'FootnotesSmall'],
        'es': ['Listaref', 'Reflist'],
        'fr': [u'Références', u'Notes', u'References', u'Reflist'],
        'hu': [u'reflist'],
        'ja': [u'Reflist'],
        'ko': [u'주석', u'Reflist'],
        'lt': [u'Reflist', u'Ref', u'Litref'],
        'zh': [u'Reflist'],
    },
}


class XmlDumpNoReferencesPageGenerator:
    """
    Generator which will yield Pages that might lack a references tag.
    These pages will be retrieved from a local XML dump file
    (pages-articles or pages-meta-current).
    """
    def __init__(self, xmlFilename):
        """
        Arguments:
            * xmlFilename  - The dump's path, either absolute or relative
        """
        self.xmlFilename = xmlFilename
        # A closing </ref> is enough to know that the page uses footnotes.
        self.refR = re.compile('</ref>', re.IGNORECASE)
        # The references tag can contain additional spaces and a group attribute.
        self.referencesR = re.compile('<references.*?/>', re.IGNORECASE)

    def __iter__(self):
        """
        Yield a Page for every dump entry whose text (with disabled parts
        removed) contains a ref tag but no references tag.
        """
        import xmlreader
        dump = xmlreader.XmlDump(self.xmlFilename)
        for entry in dump.parse():
            text = wikipedia.removeDisabledParts(entry.text)
            if self.refR.search(text) and not self.referencesR.search(text):
                yield wikipedia.Page(wikipedia.getSite(), entry.title)


class NoReferencesBot:
    """
    Bot which adds a <references /> tag to every page from the generator
    that uses <ref> tags but has neither a references tag nor one of the
    known references templates.
    """

    def __init__(self, generator, always = False):
        """
        Arguments:
            * generator - Iterable of Page objects to work on
            * always    - If True, save changes without asking the user
        """
        self.generator = generator
        self.always = always
        self.site = wikipedia.getSite()
        self.refR = re.compile('</ref>', re.IGNORECASE)
        # The references tag can contain additional spaces and a group attribute.
        self.referencesR = re.compile('<references.*?/>', re.IGNORECASE)
        try:
            self.referencesTemplates = referencesTemplates[self.site.family.name][self.site.lang]
        except KeyError:
            # No templates are known for this family/language combination.
            self.referencesTemplates = []

    def lacksReferences(self, text, verbose = True):
        """
        Checks whether or not the page is lacking a references tag.

        Arguments:
            * text    - The wikitext of the page
            * verbose - If True, report the reason for the decision

        Returns True only if the text contains a ref tag, but neither a
        references tag nor one of the known references templates.
        """
        oldTextCleaned = wikipedia.removeDisabledParts(text)
        if not self.refR.search(oldTextCleaned):
            if verbose:
                wikipedia.output(u'No changes necessary: no ref tags found.')
            return False
        if self.referencesR.search(oldTextCleaned):
            if verbose:
                wikipedia.output(u'No changes necessary: references tag found.')
            return False
        if self.referencesTemplates:
            templateR = u'{{(' + u'|'.join(self.referencesTemplates) + ')'
            if re.search(templateR, oldTextCleaned, re.IGNORECASE):
                if verbose:
                    wikipedia.output(u'No changes necessary: references template found.')
                return False
        if verbose:
            wikipedia.output(u'Found ref without references.')
        return True

    def addReferences(self, oldText):
        """
        Tries to add a references tag into an existing section where it fits
        into. If there is no such section, creates a new section containing
        the references tag.
        * Returns : The modified pagetext
        """
        # Is there an existing section where we can add the references tag?
        for section in wikipedia.translate(self.site, referencesSections):
            sectionR = re.compile(r'\n=+ *%s *=+\s*\n' % section)
            index = 0
            while index < len(oldText):
                match = sectionR.search(oldText, index)
                if not match:
                    break
                if wikipedia.isDisabled(oldText, match.start()):
                    wikipedia.output('Existing  %s section is commented out, skipping.' % section)
                    # Keep looking for another heading with the same title.
                    index = match.end()
                else:
                    wikipedia.output(u'Adding references tag to existing %s section...\n' % section)
                    newText = oldText[:match.end()] + u'\n<references />\n' + oldText[match.end():]
                    return newText

        # Create a new section for the references tag
        for section in wikipedia.translate(self.site, placeBeforeSections):
            # Find out where to place the new section. The heading's leading
            # '=' run is captured so the new heading gets the same level.
            sectionR = re.compile(r'\n(?P<ident>=+) *%s *=+\s*\n' % section)
            index = 0
            while index < len(oldText):
                match = sectionR.search(oldText, index)
                if not match:
                    break
                if wikipedia.isDisabled(oldText, match.start()):
                    wikipedia.output('Existing  %s section is commented out, won\'t add the references in front of it.' % section)
                    index = match.end()
                else:
                    wikipedia.output(u'Adding references section before %s section...\n' % section)
                    index = match.start()
                    ident = match.group('ident')
                    return self.createReferenceSection(oldText, index, ident)

        # This gets complicated: we want to place the new references
        # section over the interwiki links and categories, but also
        # over all navigation bars, persondata, and other templates
        # that are at the bottom of the page. So we need some advanced
        # regex magic.
        # The strategy is: create a temporary copy of the text. From that,
        # keep removing interwiki links, templates etc. from the bottom.
        # At the end, look at the length of the temp text. That's the position
        # where we'll insert the references section.
        catNamespaces = '|'.join(self.site.category_namespaces())
        categoryPattern = r'\[\[\s*(%s)\s*:[^\n]*\]\]\s*' % catNamespaces
        interwikiPattern = r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]\s*'
        # won't work with nested templates
        # the negative lookahead assures that we'll match the last template
        # occurrence in the temp text.
        templatePattern = r'{{((?!}}).)+?}}\s*'
        commentPattern = r'<!--((?!-->).)*?-->\s*'
        metadataR = re.compile(r'(\r\n)?(%s|%s|%s|%s)$' % (categoryPattern, interwikiPattern, templatePattern, commentPattern), re.DOTALL)
        tmpText = oldText
        while True:
            match = metadataR.search(tmpText)
            if not match:
                break
            # Chop the trailing metadata element off and try again.
            tmpText = tmpText[:match.start()]
        wikipedia.output(u'Found no section that can be preceeded by a new references section. Placing it before interwiki links, categories, and bottom templates.')
        index = len(tmpText)
        return self.createReferenceSection(oldText, index)

    def createReferenceSection(self, oldText, index, ident = '=='):
        """
        Inserts a new references section into the text.

        Arguments:
            * oldText - The wikitext of the page
            * index   - Position in oldText where the section is inserted
            * ident   - The '=' run used on both sides of the heading

        * Returns : The modified pagetext. The heading uses the first
                    (preferred) title from referencesSections for the site.
        """
        title = wikipedia.translate(self.site, referencesSections)[0]
        newSection = u'\n%s %s %s\n\n<references />\n\n' % (ident, title, ident)
        return oldText[:index] + newSection + oldText[index:]

    def save(self, page, newText):
        """
        Saves the page to the wiki, if the user accepts the changes made.

        Shows a diff first. Unless running in "always" mode, asks the user;
        answering "Always yes" switches the bot into "always" mode.
        """
        wikipedia.showDiff(page.get(), newText)
        if not self.always:
            choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No', 'Always yes'], ['y', 'N', 'a'], 'Y')
            if choice == 'n':
                return
            elif choice == 'a':
                self.always = True

        if self.always:
            try:
                page.put(newText)
            except wikipedia.EditConflict:
                wikipedia.output(u'Skipping %s because of edit conflict' % (page.title(),))
            except wikipedia.SpamfilterError:
                # sys.exc_info() instead of "except X, e" / "except X as e"
                # so that the module parses on every Python version.
                e = sys.exc_info()[1]
                wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url))
            except wikipedia.LockedPage:
                wikipedia.output(u'Skipping %s (locked page)' % (page.title(),))
        else:
            # Save the page in the background. No need to catch exceptions.
            page.put_async(newText)
        return

    def run(self):
        """
        Works through all pages of the generator: loads each page, and if it
        lacks a references tag, adds one and saves the result.
        """
        comment = wikipedia.translate(self.site, msg)
        wikipedia.setAction(comment)

        for page in self.generator:
            # Show the title of the page we're working on.
            # Highlight the title in purple.
            wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
            try:
                text = page.get()
            except wikipedia.NoPage:
                wikipedia.output(u"Page %s does not exist?!" % page.aslink())
                continue
            except wikipedia.IsRedirectPage:
                wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
                continue
            except wikipedia.LockedPage:
                wikipedia.output(u"Page %s is locked?!" % page.aslink())
                continue
            if self.lacksReferences(text):
                newText = self.addReferences(text)
                self.save(page, newText)


def main():
    """
    Parses the command line arguments, builds the page generator and runs
    the bot. Shows the help text if no pages were specified.
    """
    #page generator
    gen = None
    # This temporary array is used to read the page title if one single
    # page to work on is specified by the arguments.
    pageTitle = []
    # Which namespaces should be processed?
    # default to [] which means all namespaces will be processed
    namespaces = []
    # Never ask before changing a page
    always = False
    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()

    for arg in wikipedia.handleArgs():
        if arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
            else:
                # Skip the "-xml:" prefix.
                xmlFilename = arg[5:]
            gen = XmlDumpNoReferencesPageGenerator(xmlFilename)
        elif arg.startswith('-namespace:'):
            # A namespace may be given by number or by name.
            try:
                namespaces.append(int(arg[11:]))
            except ValueError:
                namespaces.append(arg[11:])
        elif arg == '-always':
            always = True
        else:
            generator = genFactory.handleArg(arg)
            if generator:
                gen = generator
            else:
                pageTitle.append(arg)

    if pageTitle:
        page = wikipedia.Page(wikipedia.getSite(), ' '.join(pageTitle))
        gen = iter([page])
    if not gen:
        wikipedia.showHelp('noreferences')
    else:
        if namespaces:
            gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
        preloadingGen = pagegenerators.PreloadingGenerator(gen)
        bot = NoReferencesBot(preloadingGen, always)
        bot.run()


if __name__ == "__main__" and wikipedia.handleUrlAndHeader():
    try:
        wikipedia.startContent(form=True)
        main()
    finally:
        wikipedia.endContent()
        wikipedia.stopme()